]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / find_extrema_with_reduce.hpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
13
14 #include <algorithm>
15
16 #include <boost/compute/types.hpp>
17 #include <boost/compute/command_queue.hpp>
18 #include <boost/compute/algorithm/copy.hpp>
19 #include <boost/compute/allocator/pinned_allocator.hpp>
20 #include <boost/compute/container/vector.hpp>
21 #include <boost/compute/detail/meta_kernel.hpp>
22 #include <boost/compute/detail/iterator_range_size.hpp>
23 #include <boost/compute/detail/parameter_cache.hpp>
24 #include <boost/compute/memory/local_buffer.hpp>
25 #include <boost/compute/type_traits/type_name.hpp>
26 #include <boost/compute/utility/program_cache.hpp>
27
28 namespace boost {
29 namespace compute {
30 namespace detail {
31
32 template<class InputIterator>
33 bool find_extrema_with_reduce_requirements_met(InputIterator first,
34 InputIterator last,
35 command_queue &queue)
36 {
37 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
38
39 const device &device = queue.get_device();
40
41 // device must have dedicated local memory storage
42 // otherwise reduction would be highly inefficient
43 if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
44 {
45 return false;
46 }
47
48 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
49 // local memory size in bytes (per compute unit)
50 const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
51
52 std::string cache_key = std::string("__boost_find_extrema_reduce_")
53 + type_name<input_type>();
54 // load parameters
55 boost::shared_ptr<parameter_cache> parameters =
56 detail::parameter_cache::get_global_cache(device);
57
58 // Get preferred work group size
59 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
60
61 work_group_size = (std::min)(max_work_group_size, work_group_size);
62
63 // local memory size needed to perform parallel reduction
64 size_t required_local_mem_size = 0;
65 // indices size
66 required_local_mem_size += sizeof(uint_) * work_group_size;
67 // values size
68 required_local_mem_size += sizeof(input_type) * work_group_size;
69
70 // at least 4 work groups per compute unit otherwise reduction
71 // would be highly inefficient
72 return ((required_local_mem_size * 4) <= local_mem_size);
73 }
74
75 /// \internal_
76 /// Algorithm finds the first extremum in given range, i.e., with the lowest
77 /// index.
78 ///
79 /// If \p use_input_idx is false, it's assumed that input data is ordered by
80 /// increasing index and \p input_idx is not used in the algorithm.
81 template<class InputIterator, class ResultIterator, class Compare>
82 inline void find_extrema_with_reduce(InputIterator input,
83 vector<uint_>::iterator input_idx,
84 size_t count,
85 ResultIterator result,
86 vector<uint_>::iterator result_idx,
87 size_t work_groups_no,
88 size_t work_group_size,
89 Compare compare,
90 const bool find_minimum,
91 const bool use_input_idx,
92 command_queue &queue)
93 {
94 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
95
96 const context &context = queue.get_context();
97
98 meta_kernel k("find_extrema_reduce");
99 size_t count_arg = k.add_arg<uint_>("count");
100 size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
101 size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
102
103 k <<
104 // Work item global id
105 k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
106
107 // Index of element that will be read from input buffer
108 k.decl<uint_>("idx") << " = gid;\n" <<
109
110 k.decl<input_type>("acc") << ";\n" <<
111 k.decl<uint_>("acc_idx") << ";\n" <<
112 "if(gid < count) {\n" <<
113 // Real index of currently best element
114 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
115 k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
116 "#else\n" <<
117 k.var<uint_>("acc_idx") << " = idx;\n" <<
118 "#endif\n" <<
119
120 // Init accumulator with first[get_global_id(0)]
121 "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
122 "idx += get_global_size(0);\n" <<
123 "}\n" <<
124
125 k.decl<bool>("compare_result") << ";\n" <<
126 k.decl<bool>("equal") << ";\n\n" <<
127 "while( idx < count ){\n" <<
128 // Next element
129 k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
130 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
131 k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
132 "#endif\n" <<
133
134 // Comparison between currently best element (acc) and next element
135 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
136 "compare_result = " << compare(k.var<input_type>("next"),
137 k.var<input_type>("acc")) << ";\n" <<
138 "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
139 "equal = !compare_result && !" <<
140 compare(k.var<input_type>("acc"),
141 k.var<input_type>("next")) << ";\n" <<
142 "# endif\n" <<
143 "#else\n" <<
144 "compare_result = " << compare(k.var<input_type>("acc"),
145 k.var<input_type>("next")) << ";\n" <<
146 "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
147 "equal = !compare_result && !" <<
148 compare(k.var<input_type>("next"),
149 k.var<input_type>("acc")) << ";\n" <<
150 "# endif\n" <<
151 "#endif\n" <<
152
153 // save the winner
154 "acc = compare_result ? acc : next;\n" <<
155 "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
156 "acc_idx = compare_result ? " <<
157 "acc_idx : " <<
158 "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
159 "#else\n" <<
160 "acc_idx = compare_result ? acc_idx : idx;\n" <<
161 "#endif\n" <<
162 "idx += get_global_size(0);\n" <<
163 "}\n\n" <<
164
165 // Work item local id
166 k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
167 "block[lid] = acc;\n" <<
168 "block_idx[lid] = acc_idx;\n" <<
169 "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
170
171 k.decl<uint_>("group_offset") <<
172 " = count - (get_local_size(0) * get_group_id(0));\n\n";
173
174 k <<
175 "#pragma unroll\n"
176 "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
177 "offset = offset / 2) {\n" <<
178 "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
179 k.decl<input_type>("mine") << " = block[lid];\n" <<
180 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
181 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
182 "compare_result = " << compare(k.var<input_type>("other"),
183 k.var<input_type>("mine")) << ";\n" <<
184 "equal = !compare_result && !" <<
185 compare(k.var<input_type>("mine"),
186 k.var<input_type>("other")) << ";\n" <<
187 "#else\n" <<
188 "compare_result = " << compare(k.var<input_type>("mine"),
189 k.var<input_type>("other")) << ";\n" <<
190 "equal = !compare_result && !" <<
191 compare(k.var<input_type>("other"),
192 k.var<input_type>("mine")) << ";\n" <<
193 "#endif\n" <<
194 "block[lid] = compare_result ? mine : other;\n" <<
195 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
196 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
197 "block_idx[lid] = compare_result ? " <<
198 "mine_idx : " <<
199 "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
200 "}\n"
201 "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
202 "}\n\n" <<
203
204 // write block result to global output
205 "if(lid == 0){\n" <<
206 result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
207 result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
208 "}";
209
210 std::string options;
211 if(!find_minimum){
212 options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
213 }
214 if(use_input_idx){
215 options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
216 }
217
218 kernel kernel = k.compile(context, options);
219
220 kernel.set_arg(count_arg, static_cast<uint_>(count));
221 kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
222 kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
223
224 queue.enqueue_1d_range_kernel(kernel,
225 0,
226 work_groups_no * work_group_size,
227 work_group_size);
228 }
229
230 template<class InputIterator, class ResultIterator, class Compare>
231 inline void find_extrema_with_reduce(InputIterator input,
232 size_t count,
233 ResultIterator result,
234 vector<uint_>::iterator result_idx,
235 size_t work_groups_no,
236 size_t work_group_size,
237 Compare compare,
238 const bool find_minimum,
239 command_queue &queue)
240 {
241 // dummy will not be used
242 buffer_iterator<uint_> dummy = result_idx;
243 return find_extrema_with_reduce(
244 input, dummy, count, result, result_idx, work_groups_no,
245 work_group_size, compare, find_minimum, false, queue
246 );
247 }
248
249 template<class InputIterator, class Compare>
250 InputIterator find_extrema_with_reduce(InputIterator first,
251 InputIterator last,
252 Compare compare,
253 const bool find_minimum,
254 command_queue &queue)
255 {
256 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
257 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
258
259 const context &context = queue.get_context();
260 const device &device = queue.get_device();
261
262 // Getting information about used queue and device
263 const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
264 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
265
266 const size_t count = detail::iterator_range_size(first, last);
267
268 std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
269 + type_name<input_type>();
270
271 // load parameters
272 boost::shared_ptr<parameter_cache> parameters =
273 detail::parameter_cache::get_global_cache(device);
274
275 // get preferred work group size and preferred number
276 // of work groups per compute unit
277 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
278 size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
279
280 // calculate work group size and number of work groups
281 work_group_size = (std::min)(max_work_group_size, work_group_size);
282 size_t work_groups_no = compute_units_no * work_groups_per_cu;
283 work_groups_no = (std::min)(
284 work_groups_no,
285 static_cast<size_t>(std::ceil(float(count) / work_group_size))
286 );
287
288 // phase I: finding candidates for extremum
289
290 // device buffors for extremum candidates and their indices
291 // each work-group computes its candidate
292 vector<input_type> candidates(work_groups_no, context);
293 vector<uint_> candidates_idx(work_groups_no, context);
294
295 // finding candidates for first extremum and their indices
296 find_extrema_with_reduce(
297 first, count, candidates.begin(), candidates_idx.begin(),
298 work_groups_no, work_group_size, compare, find_minimum, queue
299 );
300
301 // phase II: finding extremum from among the candidates
302
303 // zero-copy buffers for final result (value and index)
304 vector<input_type, ::boost::compute::pinned_allocator<input_type> >
305 result(1, context);
306 vector<uint_, ::boost::compute::pinned_allocator<uint_> >
307 result_idx(1, context);
308
309 // get extremum from among the candidates
310 find_extrema_with_reduce(
311 candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
312 result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
313 );
314
315 // mapping extremum index to host
316 uint_* result_idx_host_ptr =
317 static_cast<uint_*>(
318 queue.enqueue_map_buffer(
319 result_idx.get_buffer(), command_queue::map_read,
320 0, sizeof(uint_)
321 )
322 );
323
324 return first + static_cast<difference_type>(*result_idx_host_ptr);
325 }
326
327 template<class InputIterator>
328 InputIterator find_extrema_with_reduce(InputIterator first,
329 InputIterator last,
330 ::boost::compute::less<
331 typename std::iterator_traits<
332 InputIterator
333 >::value_type
334 >
335 compare,
336 const bool find_minimum,
337 command_queue &queue)
338 {
339 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
340 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
341
342 const context &context = queue.get_context();
343 const device &device = queue.get_device();
344
345 // Getting information about used queue and device
346 const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
347 const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
348
349 const size_t count = detail::iterator_range_size(first, last);
350
351 std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
352 + type_name<input_type>();
353
354 // load parameters
355 boost::shared_ptr<parameter_cache> parameters =
356 detail::parameter_cache::get_global_cache(device);
357
358 // get preferred work group size and preferred number
359 // of work groups per compute unit
360 size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
361 size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
362
363 // calculate work group size and number of work groups
364 work_group_size = (std::min)(max_work_group_size, work_group_size);
365 size_t work_groups_no = compute_units_no * work_groups_per_cu;
366 work_groups_no = (std::min)(
367 work_groups_no,
368 static_cast<size_t>(std::ceil(float(count) / work_group_size))
369 );
370
371 // phase I: finding candidates for extremum
372
373 // device buffors for extremum candidates and their indices
374 // each work-group computes its candidate
375 // zero-copy buffers are used to eliminate copying data back to host
376 vector<input_type, ::boost::compute::pinned_allocator<input_type> >
377 candidates(work_groups_no, context);
378 vector<uint_, ::boost::compute::pinned_allocator <uint_> >
379 candidates_idx(work_groups_no, context);
380
381 // finding candidates for first extremum and their indices
382 find_extrema_with_reduce(
383 first, count, candidates.begin(), candidates_idx.begin(),
384 work_groups_no, work_group_size, compare, find_minimum, queue
385 );
386
387 // phase II: finding extremum from among the candidates
388
389 // mapping candidates and their indices to host
390 input_type* candidates_host_ptr =
391 static_cast<input_type*>(
392 queue.enqueue_map_buffer(
393 candidates.get_buffer(), command_queue::map_read,
394 0, work_groups_no * sizeof(input_type)
395 )
396 );
397
398 uint_* candidates_idx_host_ptr =
399 static_cast<uint_*>(
400 queue.enqueue_map_buffer(
401 candidates_idx.get_buffer(), command_queue::map_read,
402 0, work_groups_no * sizeof(uint_)
403 )
404 );
405
406 input_type* i = candidates_host_ptr;
407 uint_* idx = candidates_idx_host_ptr;
408 uint_* extremum_idx = idx;
409 input_type extremum = *candidates_host_ptr;
410 i++; idx++;
411
412 // find extremum (serial) from among the candidates on host
413 if(!find_minimum) {
414 while(idx != (candidates_idx_host_ptr + work_groups_no)) {
415 input_type next = *i;
416 bool compare_result = next > extremum;
417 bool equal = next == extremum;
418 extremum = compare_result ? next : extremum;
419 extremum_idx = compare_result ? idx : extremum_idx;
420 extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
421 idx++, i++;
422 }
423 }
424 else {
425 while(idx != (candidates_idx_host_ptr + work_groups_no)) {
426 input_type next = *i;
427 bool compare_result = next < extremum;
428 bool equal = next == extremum;
429 extremum = compare_result ? next : extremum;
430 extremum_idx = compare_result ? idx : extremum_idx;
431 extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
432 idx++, i++;
433 }
434 }
435
436 return first + static_cast<difference_type>(*extremum_idx);
437 }
438
439 } // end detail namespace
440 } // end compute namespace
441 } // end boost namespace
442
443 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP