ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp

   1 //---------------------------------------------------------------------------//
   2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
   3 //
   4 // Distributed under the Boost Software License, Version 1.0
   5 // See accompanying file LICENSE_1_0.txt or copy at
   6 // http://www.boost.org/LICENSE_1_0.txt
   7 //
   8 // See http://boostorg.github.com/compute for more information.
   9 //---------------------------------------------------------------------------//
  10
  11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
  12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
  13
  14 #include <algorithm>
  15
  16 #include <boost/compute/types.hpp>
  17 #include <boost/compute/command_queue.hpp>
  18 #include <boost/compute/algorithm/copy.hpp>
  19 #include <boost/compute/allocator/pinned_allocator.hpp>
  20 #include <boost/compute/container/vector.hpp>
  21 #include <boost/compute/detail/meta_kernel.hpp>
  22 #include <boost/compute/detail/iterator_range_size.hpp>
  23 #include <boost/compute/detail/parameter_cache.hpp>
  24 #include <boost/compute/memory/local_buffer.hpp>
  25 #include <boost/compute/type_traits/type_name.hpp>
  26 #include <boost/compute/utility/program_cache.hpp>
  27
  28 namespace boost {
  29 namespace compute {
  30 namespace detail {
  31
  32 template<class InputIterator>
  33 bool find_extrema_with_reduce_requirements_met(InputIterator first,
  34                                                InputIterator last,
  35                                                command_queue &queue)
  36 {
  37     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
  38
  39     const device &device = queue.get_device();
  40
  41     // device must have dedicated local memory storage
  42     // otherwise reduction would be highly inefficient
  43     if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
  44     {
  45         return false;
  46     }
  47
  48     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  49     // local memory size in bytes (per compute unit)
  50     const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
  51
  52     std::string cache_key = std::string("__boost_find_extrema_reduce_")
  53         + type_name<input_type>();
  54     // load parameters
  55     boost::shared_ptr<parameter_cache> parameters =
  56         detail::parameter_cache::get_global_cache(device);
  57
  58     // Get preferred work group size
  59     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
  60
  61     work_group_size = (std::min)(max_work_group_size, work_group_size);
  62
  63     // local memory size needed to perform parallel reduction
  64     size_t required_local_mem_size = 0;
  65     // indices size
  66     required_local_mem_size += sizeof(uint_) * work_group_size;
  67     // values size
  68     required_local_mem_size += sizeof(input_type) * work_group_size;
  69
  70     // at least 4 work groups per compute unit otherwise reduction
  71     // would be highly inefficient
  72     return ((required_local_mem_size * 4) <= local_mem_size);
  73 }
  74
  75 /// \internal_
  76 /// Algorithm finds the first extremum in given range, i.e., with the lowest
  77 /// index.
  78 ///
  79 /// If \p use_input_idx is false, it's assumed that input data is ordered by
  80 /// increasing index and \p input_idx is not used in the algorithm.
  81 template<class InputIterator, class ResultIterator, class Compare>
  82 inline void find_extrema_with_reduce(InputIterator input,
  83                                      vector<uint_>::iterator input_idx,
  84                                      size_t count,
  85                                      ResultIterator result,
  86                                      vector<uint_>::iterator result_idx,
  87                                      size_t work_groups_no,
  88                                      size_t work_group_size,
  89                                      Compare compare,
  90                                      const bool find_minimum,
  91                                      const bool use_input_idx,
  92                                      command_queue &queue)
  93 {
  94     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
  95
  96     const context &context = queue.get_context();
  97
  98     meta_kernel k("find_extrema_reduce");
  99     size_t count_arg = k.add_arg<uint_>("count");
 100     size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
 101     size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
 102
 103     k <<
 104         // Work item global id
 105         k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
 106
 107         // Index of element that will be read from input buffer
 108         k.decl<uint_>("idx") << " = gid;\n" <<
 109
 110         k.decl<input_type>("acc") << ";\n" <<
 111         k.decl<uint_>("acc_idx") << ";\n" <<
 112         "if(gid < count) {\n" <<
 113             // Real index of currently best element
 114             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
 115             k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
 116             "#else\n" <<
 117             k.var<uint_>("acc_idx") << " = idx;\n" <<
 118             "#endif\n" <<
 119
 120             // Init accumulator with first[get_global_id(0)]
 121             "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
 122             "idx += get_global_size(0);\n" <<
 123         "}\n" <<
 124
 125         k.decl<bool>("compare_result") << ";\n" <<
 126         k.decl<bool>("equal") << ";\n\n" <<
 127         "while( idx < count ){\n" <<
 128             // Next element
 129             k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
 130             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
 131             k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
 132             "#endif\n" <<
 133
 134             // Comparison between currently best element (acc) and next element
 135             "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
 136             "compare_result = " << compare(k.var<input_type>("next"),
 137                                            k.var<input_type>("acc")) << ";\n" <<
 138             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
 139             "equal = !compare_result && !" <<
 140                 compare(k.var<input_type>("acc"),
 141                         k.var<input_type>("next")) << ";\n" <<
 142             "# endif\n" <<
 143             "#else\n" <<
 144             "compare_result = " << compare(k.var<input_type>("acc"),
 145                                            k.var<input_type>("next")) << ";\n" <<
 146             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
 147             "equal = !compare_result && !" <<
 148                 compare(k.var<input_type>("next"),
 149                         k.var<input_type>("acc")) << ";\n" <<
 150             "# endif\n" <<
 151             "#endif\n" <<
 152
 153             // save the winner
 154             "acc = compare_result ? acc : next;\n" <<
 155             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
 156             "acc_idx = compare_result ? " <<
 157                 "acc_idx : " <<
 158                 "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
 159             "#else\n" <<
 160             "acc_idx = compare_result ? acc_idx : idx;\n" <<
 161             "#endif\n" <<
 162             "idx += get_global_size(0);\n" <<
 163         "}\n\n" <<
 164
 165         // Work item local id
 166         k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
 167         "block[lid] = acc;\n" <<
 168         "block_idx[lid] = acc_idx;\n" <<
 169         "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
 170
 171         k.decl<uint_>("group_offset") <<
 172             " = count - (get_local_size(0) * get_group_id(0));\n\n";
 173
 174     k <<
 175         "#pragma unroll\n"
 176         "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
 177              "offset = offset / 2) {\n" <<
 178              "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
 179                  k.decl<input_type>("mine") << " = block[lid];\n" <<
 180                  k.decl<input_type>("other") << " = block[lid+offset];\n" <<
 181                  "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
 182                  "compare_result = " << compare(k.var<input_type>("other"),
 183                                                 k.var<input_type>("mine")) << ";\n" <<
 184                  "equal = !compare_result && !" <<
 185                      compare(k.var<input_type>("mine"),
 186                              k.var<input_type>("other")) << ";\n" <<
 187                  "#else\n" <<
 188                  "compare_result = " << compare(k.var<input_type>("mine"),
 189                                                 k.var<input_type>("other")) << ";\n" <<
 190                  "equal = !compare_result && !" <<
 191                      compare(k.var<input_type>("other"),
 192                              k.var<input_type>("mine")) << ";\n" <<
 193                  "#endif\n" <<
 194                  "block[lid] = compare_result ? mine : other;\n" <<
 195                  k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
 196                  k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
 197                  "block_idx[lid] = compare_result ? " <<
 198                      "mine_idx : " <<
 199                      "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
 200              "}\n"
 201              "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
 202         "}\n\n" <<
 203
 204          // write block result to global output
 205         "if(lid == 0){\n" <<
 206             result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
 207             result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
 208         "}";
 209
 210     std::string options;
 211     if(!find_minimum){
 212         options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
 213     }
 214     if(use_input_idx){
 215         options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
 216     }
 217
 218     kernel kernel = k.compile(context, options);
 219
 220     kernel.set_arg(count_arg, static_cast<uint_>(count));
 221     kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
 222     kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
 223
 224     queue.enqueue_1d_range_kernel(kernel,
 225                                   0,
 226                                   work_groups_no * work_group_size,
 227                                   work_group_size);
 228 }
 229
 230 template<class InputIterator, class ResultIterator, class Compare>
 231 inline void find_extrema_with_reduce(InputIterator input,
 232                                      size_t count,
 233                                      ResultIterator result,
 234                                      vector<uint_>::iterator result_idx,
 235                                      size_t work_groups_no,
 236                                      size_t work_group_size,
 237                                      Compare compare,
 238                                      const bool find_minimum,
 239                                      command_queue &queue)
 240 {
 241     // dummy will not be used
 242     buffer_iterator<uint_> dummy = result_idx;
 243     return find_extrema_with_reduce(
 244         input, dummy, count, result, result_idx, work_groups_no,
 245         work_group_size, compare, find_minimum, false, queue
 246     );
 247 }
 248
 249 template<class InputIterator, class Compare>
 250 InputIterator find_extrema_with_reduce(InputIterator first,
 251                                        InputIterator last,
 252                                        Compare compare,
 253                                        const bool find_minimum,
 254                                        command_queue &queue)
 255 {
 256     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
 257     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 258
 259     const context &context = queue.get_context();
 260     const device &device = queue.get_device();
 261
 262     // Getting information about used queue and device
 263     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
 264     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
 265
 266     const size_t count = detail::iterator_range_size(first, last);
 267
 268     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
 269         + type_name<input_type>();
 270
 271     // load parameters
 272     boost::shared_ptr<parameter_cache> parameters =
 273         detail::parameter_cache::get_global_cache(device);
 274
 275     // get preferred work group size and preferred number
 276     // of work groups per compute unit
 277     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
 278     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
 279
 280     // calculate work group size and number of work groups
 281     work_group_size = (std::min)(max_work_group_size, work_group_size);
 282     size_t work_groups_no = compute_units_no * work_groups_per_cu;
 283     work_groups_no = (std::min)(
 284         work_groups_no,
 285         static_cast<size_t>(std::ceil(float(count) / work_group_size))
 286     );
 287
 288     // phase I: finding candidates for extremum
 289
 290     // device buffors for extremum candidates and their indices
 291     // each work-group computes its candidate
 292     vector<input_type> candidates(work_groups_no, context);
 293     vector<uint_> candidates_idx(work_groups_no, context);
 294
 295     // finding candidates for first extremum and their indices
 296     find_extrema_with_reduce(
 297         first, count, candidates.begin(), candidates_idx.begin(),
 298         work_groups_no, work_group_size, compare, find_minimum, queue
 299     );
 300
 301     // phase II: finding extremum from among the candidates
 302
 303     // zero-copy buffers for final result (value and index)
 304     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
 305         result(1, context);
 306     vector<uint_, ::boost::compute::pinned_allocator<uint_> >
 307         result_idx(1, context);
 308
 309     // get extremum from among the candidates
 310     find_extrema_with_reduce(
 311         candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
 312         result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
 313     );
 314
 315     // mapping extremum index to host
 316     uint_* result_idx_host_ptr =
 317         static_cast<uint_*>(
 318             queue.enqueue_map_buffer(
 319                 result_idx.get_buffer(), command_queue::map_read,
 320                 0, sizeof(uint_)
 321             )
 322         );
 323
 324     return first + static_cast<difference_type>(*result_idx_host_ptr);
 325 }
 326
 327 template<class InputIterator>
 328 InputIterator find_extrema_with_reduce(InputIterator first,
 329                                        InputIterator last,
 330                                        ::boost::compute::less<
 331                                            typename std::iterator_traits<
 332                                                InputIterator
 333                                            >::value_type
 334                                        >
 335                                        compare,
 336                                        const bool find_minimum,
 337                                        command_queue &queue)
 338 {
 339     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
 340     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 341
 342     const context &context = queue.get_context();
 343     const device &device = queue.get_device();
 344
 345     // Getting information about used queue and device
 346     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
 347     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
 348
 349     const size_t count = detail::iterator_range_size(first, last);
 350
 351     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
 352         + type_name<input_type>();
 353
 354     // load parameters
 355     boost::shared_ptr<parameter_cache> parameters =
 356         detail::parameter_cache::get_global_cache(device);
 357
 358     // get preferred work group size and preferred number
 359     // of work groups per compute unit
 360     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
 361     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
 362
 363     // calculate work group size and number of work groups
 364     work_group_size = (std::min)(max_work_group_size, work_group_size);
 365     size_t work_groups_no = compute_units_no * work_groups_per_cu;
 366     work_groups_no = (std::min)(
 367         work_groups_no,
 368         static_cast<size_t>(std::ceil(float(count) / work_group_size))
 369     );
 370
 371     // phase I: finding candidates for extremum
 372
 373     // device buffors for extremum candidates and their indices
 374     // each work-group computes its candidate
 375     // zero-copy buffers are used to eliminate copying data back to host
 376     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
 377         candidates(work_groups_no, context);
 378     vector<uint_, ::boost::compute::pinned_allocator <uint_> >
 379         candidates_idx(work_groups_no, context);
 380
 381     // finding candidates for first extremum and their indices
 382     find_extrema_with_reduce(
 383         first, count, candidates.begin(), candidates_idx.begin(),
 384         work_groups_no, work_group_size, compare, find_minimum, queue
 385     );
 386
 387     // phase II: finding extremum from among the candidates
 388
 389     // mapping candidates and their indices to host
 390     input_type* candidates_host_ptr =
 391         static_cast<input_type*>(
 392             queue.enqueue_map_buffer(
 393                 candidates.get_buffer(), command_queue::map_read,
 394                 0, work_groups_no * sizeof(input_type)
 395             )
 396         );
 397
 398     uint_* candidates_idx_host_ptr =
 399         static_cast<uint_*>(
 400             queue.enqueue_map_buffer(
 401                 candidates_idx.get_buffer(), command_queue::map_read,
 402                 0, work_groups_no * sizeof(uint_)
 403             )
 404         );
 405
 406     input_type* i = candidates_host_ptr;
 407     uint_* idx = candidates_idx_host_ptr;
 408     uint_* extremum_idx = idx;
 409     input_type extremum = *candidates_host_ptr;
 410     i++; idx++;
 411
 412     // find extremum (serial) from among the candidates on host
 413     if(!find_minimum) {
 414         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
 415             input_type next = *i;
 416             bool compare_result =  next > extremum;
 417             bool equal = next == extremum;
 418             extremum = compare_result ? next : extremum;
 419             extremum_idx = compare_result ? idx : extremum_idx;
 420             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
 421             idx++, i++;
 422         }
 423     }
 424     else {
 425         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
 426             input_type next = *i;
 427             bool compare_result = next < extremum;
 428             bool equal = next == extremum;
 429             extremum = compare_result ? next : extremum;
 430             extremum_idx = compare_result ? idx : extremum_idx;
 431             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
 432             idx++, i++;
 433         }
 434     }
 435
 436     return first + static_cast<difference_type>(*extremum_idx);
 437 }
 438
 439 } // end detail namespace
 440 } // end compute namespace
 441 } // end boost namespace
 442
 443 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP