ceph/src/boost/libs/compute/include/boost/compute/algorithm/reduce.hpp

   1 //---------------------------------------------------------------------------//
   2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
   3 //
   4 // Distributed under the Boost Software License, Version 1.0
   5 // See accompanying file LICENSE_1_0.txt or copy at
   6 // http://www.boost.org/LICENSE_1_0.txt
   7 //
   8 // See http://boostorg.github.com/compute for more information.
   9 //---------------------------------------------------------------------------//
  10
  11 #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
  12 #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
  13
  14 #include <iterator>
  15
  16 #include <boost/compute/system.hpp>
  17 #include <boost/compute/functional.hpp>
  18 #include <boost/compute/detail/meta_kernel.hpp>
  19 #include <boost/compute/command_queue.hpp>
  20 #include <boost/compute/container/array.hpp>
  21 #include <boost/compute/container/vector.hpp>
  22 #include <boost/compute/algorithm/copy_n.hpp>
  23 #include <boost/compute/algorithm/detail/inplace_reduce.hpp>
  24 #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
  25 #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
  26 #include <boost/compute/detail/iterator_range_size.hpp>
  27 #include <boost/compute/memory/local_buffer.hpp>
  28 #include <boost/compute/type_traits/result_of.hpp>
  29
  30 namespace boost {
  31 namespace compute {
  32 namespace detail {
  33
  34 template<class InputIterator, class OutputIterator, class BinaryFunction>
  35 size_t reduce(InputIterator first,
  36               size_t count,
  37               OutputIterator result,
  38               size_t block_size,
  39               BinaryFunction function,
  40               command_queue &queue)
  41 {
  42     typedef typename
  43         std::iterator_traits<InputIterator>::value_type
  44         input_type;
  45     typedef typename
  46         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
  47         result_type;
  48
  49     const context &context = queue.get_context();
  50     size_t block_count = count / 2 / block_size;
  51     size_t total_block_count =
  52         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
  53
  54     if(block_count != 0){
  55         meta_kernel k("block_reduce");
  56         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
  57         size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
  58
  59         k <<
  60             "const uint gid = get_global_id(0);\n" <<
  61             "const uint lid = get_local_id(0);\n" <<
  62
  63             // copy values to local memory
  64             "block[lid] = " <<
  65                 function(first[k.make_var<uint_>("gid*2+0")],
  66                          first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
  67
  68             // perform reduction
  69             "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
  70             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
  71             "    uint mask = (i << 1) - 1;\n" <<
  72             "    if((lid & mask) == 0){\n" <<
  73             "        block[lid] = " <<
  74                          function(k.expr<input_type>("block[lid]"),
  75                                   k.expr<input_type>("block[lid+i]")) << ";\n" <<
  76             "    }\n" <<
  77             "}\n" <<
  78
  79             // write block result to global output
  80             "if(lid == 0)\n" <<
  81             "    output[get_group_id(0)] = block[0];\n";
  82
  83         kernel kernel = k.compile(context);
  84         kernel.set_arg(output_arg, result.get_buffer());
  85         kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
  86
  87         queue.enqueue_1d_range_kernel(kernel,
  88                                       0,
  89                                       block_count * block_size,
  90                                       block_size);
  91     }
  92
  93     // serially reduce any leftovers
  94     if(block_count * block_size * 2 < count){
  95         size_t last_block_start = block_count * block_size * 2;
  96
  97         meta_kernel k("extra_serial_reduce");
  98         size_t count_arg = k.add_arg<uint_>("count");
  99         size_t offset_arg = k.add_arg<uint_>("offset");
 100         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
 101         size_t output_offset_arg = k.add_arg<uint_>("output_offset");
 102
 103         k <<
 104             k.decl<result_type>("result") << " = \n" <<
 105                 first[k.expr<uint_>("offset")] << ";\n" <<
 106             "for(uint i = offset + 1; i < count; i++)\n" <<
 107             "    result = " <<
 108                      function(k.var<result_type>("result"),
 109                               first[k.var<uint_>("i")]) << ";\n" <<
 110             "output[output_offset] = result;\n";
 111
 112         kernel kernel = k.compile(context);
 113         kernel.set_arg(count_arg, static_cast<uint_>(count));
 114         kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
 115         kernel.set_arg(output_arg, result.get_buffer());
 116         kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
 117
 118         queue.enqueue_task(kernel);
 119     }
 120
 121     return total_block_count;
 122 }
 123
 124 template<class InputIterator, class BinaryFunction>
 125 inline vector<
 126     typename boost::compute::result_of<
 127         BinaryFunction(
 128             typename std::iterator_traits<InputIterator>::value_type,
 129             typename std::iterator_traits<InputIterator>::value_type
 130         )
 131     >::type
 132 >
 133 block_reduce(InputIterator first,
 134              size_t count,
 135              size_t block_size,
 136              BinaryFunction function,
 137              command_queue &queue)
 138 {
 139     typedef typename
 140         std::iterator_traits<InputIterator>::value_type
 141         input_type;
 142     typedef typename
 143         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
 144         result_type;
 145
 146     const context &context = queue.get_context();
 147     size_t total_block_count =
 148         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
 149     vector<result_type> result_vector(total_block_count, context);
 150
 151     reduce(first, count, result_vector.begin(), block_size, function, queue);
 152
 153     return result_vector;
 154 }
 155
 156 template<class InputIterator, class OutputIterator, class BinaryFunction>
 157 inline void generic_reduce(InputIterator first,
 158                            InputIterator last,
 159                            OutputIterator result,
 160                            BinaryFunction function,
 161                            command_queue &queue)
 162 {
 163     typedef typename
 164         std::iterator_traits<InputIterator>::value_type
 165         input_type;
 166     typedef typename
 167         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
 168         result_type;
 169
 170     const device &device = queue.get_device();
 171     const context &context = queue.get_context();
 172
 173     size_t count = detail::iterator_range_size(first, last);
 174
 175     if(device.type() & device::cpu){
 176         array<result_type, 1> value(context);
 177         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
 178         boost::compute::copy_n(value.begin(), 1, result, queue);
 179     }
 180     else {
 181         size_t block_size = 256;
 182
 183         // first pass
 184         vector<result_type> results = detail::block_reduce(first,
 185                                                            count,
 186                                                            block_size,
 187                                                            function,
 188                                                            queue);
 189
 190         if(results.size() > 1){
 191             detail::inplace_reduce(results.begin(),
 192                                    results.end(),
 193                                    function,
 194                                    queue);
 195         }
 196
 197         boost::compute::copy_n(results.begin(), 1, result, queue);
 198     }
 199 }
 200
 201 template<class InputIterator, class OutputIterator, class T>
 202 inline void dispatch_reduce(InputIterator first,
 203                             InputIterator last,
 204                             OutputIterator result,
 205                             const plus<T> &function,
 206                             command_queue &queue)
 207 {
 208     const context &context = queue.get_context();
 209     const device &device = queue.get_device();
 210
 211     // reduce to temporary buffer on device
 212     array<T, 1> value(context);
 213     if(device.type() & device::cpu){
 214         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
 215     }
 216     else {
 217         reduce_on_gpu(first, last, value.begin(), function, queue);
 218     }
 219
 220     // copy to result iterator
 221     copy_n(value.begin(), 1, result, queue);
 222 }
 223
 224 template<class InputIterator, class OutputIterator, class BinaryFunction>
 225 inline void dispatch_reduce(InputIterator first,
 226                             InputIterator last,
 227                             OutputIterator result,
 228                             BinaryFunction function,
 229                             command_queue &queue)
 230 {
 231     generic_reduce(first, last, result, function, queue);
 232 }
 233
 234 } // end detail namespace
 235
 236 /// Returns the result of applying \p function to the elements in the
 237 /// range [\p first, \p last).
 238 ///
 239 /// If no function is specified, \c plus will be used.
 240 ///
 241 /// \param first first element in the input range
 242 /// \param last last element in the input range
 243 /// \param result iterator pointing to the output
 244 /// \param function binary reduction function
 245 /// \param queue command queue to perform the operation
 246 ///
 247 /// The \c reduce() algorithm assumes that the binary reduction function is
 248 /// associative. When used with non-associative functions the result may
 249 /// be non-deterministic and vary in precision. Notably this affects the
 250 /// \c plus<float>() function as floating-point addition is not associative
 251 /// and may produce slightly different results than a serial algorithm.
 252 ///
 253 /// This algorithm supports both host and device iterators for the
 254 /// result argument. This allows for values to be reduced and copied
 255 /// to the host all with a single function call.
 256 ///
 257 /// For example, to calculate the sum of the values in a device vector and
 258 /// copy the result to a value on the host:
 259 ///
 260 /// \snippet test/test_reduce.cpp sum_int
 261 ///
 262 /// Note that while the the \c reduce() algorithm is conceptually identical to
 263 /// the \c accumulate() algorithm, its implementation is substantially more
 264 /// efficient on parallel hardware. For more information, see the documentation
 265 /// on the \c accumulate() algorithm.
 266 ///
 267 /// \see accumulate()
 268 template<class InputIterator, class OutputIterator, class BinaryFunction>
 269 inline void reduce(InputIterator first,
 270                    InputIterator last,
 271                    OutputIterator result,
 272                    BinaryFunction function,
 273                    command_queue &queue = system::default_queue())
 274 {
 275     if(first == last){
 276         return;
 277     }
 278
 279     detail::dispatch_reduce(first, last, result, function, queue);
 280 }
 281
 282 /// \overload
 283 template<class InputIterator, class OutputIterator>
 284 inline void reduce(InputIterator first,
 285                    InputIterator last,
 286                    OutputIterator result,
 287                    command_queue &queue = system::default_queue())
 288 {
 289     typedef typename std::iterator_traits<InputIterator>::value_type T;
 290
 291     if(first == last){
 292         return;
 293     }
 294
 295     detail::dispatch_reduce(first, last, result, plus<T>(), queue);
 296 }
 297
 298 } // end compute namespace
 299 } // end boost namespace
 300
 301 #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP