ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/count_if_with_threads.hpp

   1 //---------------------------------------------------------------------------//
   2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
   3 //
   4 // Distributed under the Boost Software License, Version 1.0
   5 // See accompanying file LICENSE_1_0.txt or copy at
   6 // http://www.boost.org/LICENSE_1_0.txt
   7 //
   8 // See http://boostorg.github.com/compute for more information.
   9 //---------------------------------------------------------------------------//
  10
  11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
  12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
  13
  14 #include <numeric>
  15
  16 #include <boost/compute/detail/meta_kernel.hpp>
  17 #include <boost/compute/container/vector.hpp>
  18
  19 namespace boost {
  20 namespace compute {
  21 namespace detail {
  22
  23 template<class InputIterator, class Predicate>
  24 class count_if_with_threads_kernel : meta_kernel
  25 {
  26 public:
  27     typedef typename
  28         std::iterator_traits<InputIterator>::value_type
  29         value_type;
  30
  31     count_if_with_threads_kernel()
  32         : meta_kernel("count_if_with_threads")
  33     {
  34     }
  35
  36     void set_args(InputIterator first,
  37                   InputIterator last,
  38                   Predicate predicate)
  39
  40     {
  41         typedef typename std::iterator_traits<InputIterator>::value_type T;
  42
  43         m_size = detail::iterator_range_size(first, last);
  44
  45         m_size_arg = add_arg<const ulong_>("size");
  46         m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");
  47
  48         *this <<
  49             // thread parameters
  50             "const uint gid = get_global_id(0);\n" <<
  51             "const uint block_size = size / get_global_size(0);\n" <<
  52             "const uint start = block_size * gid;\n" <<
  53             "uint end = 0;\n" <<
  54             "if(gid == get_global_size(0) - 1)\n" <<
  55             "    end = size;\n" <<
  56             "else\n" <<
  57             "    end = block_size * gid + block_size;\n" <<
  58
  59             // count values
  60             "uint count = 0;\n" <<
  61             "for(uint i = start; i < end; i++){\n" <<
  62                 decl<const T>("value") << "="
  63                     << first[expr<uint_>("i")] << ";\n" <<
  64                 if_(predicate(var<const T>("value"))) << "{\n" <<
  65                     "count++;\n" <<
  66                 "}\n" <<
  67             "}\n" <<
  68
  69             // write count
  70             "counts[gid] = count;\n";
  71     }
  72
  73     size_t exec(command_queue &queue)
  74     {
  75         const device &device = queue.get_device();
  76         const context &context = queue.get_context();
  77
  78         size_t threads = device.compute_units();
  79
  80         const size_t minimum_block_size = 2048;
  81         if(m_size / threads < minimum_block_size){
  82             threads = static_cast<size_t>(
  83                           (std::max)(
  84                               std::ceil(float(m_size) / minimum_block_size),
  85                               1.0f
  86                           )
  87                       );
  88         }
  89
  90         // storage for counts
  91         ::boost::compute::vector<ulong_> counts(threads, context);
  92
  93         // exec kernel
  94         set_arg(m_size_arg, static_cast<ulong_>(m_size));
  95         set_arg(m_counts_arg, counts.get_buffer());
  96         exec_1d(queue, 0, threads, 1);
  97
  98         // copy counts to the host
  99         std::vector<ulong_> host_counts(threads);
 100         ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);
 101
 102         // return sum of counts
 103         return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
 104     }
 105
 106 private:
 107     size_t m_size;
 108     size_t m_size_arg;
 109     size_t m_counts_arg;
 110 };
 111
 112 // counts values that match the predicate using one thread per block. this is
 113 // optimized for cpu-type devices with a small number of compute units.
 114 template<class InputIterator, class Predicate>
 115 inline size_t count_if_with_threads(InputIterator first,
 116                                     InputIterator last,
 117                                     Predicate predicate,
 118                                     command_queue &queue)
 119 {
 120     count_if_with_threads_kernel<InputIterator, Predicate> kernel;
 121     kernel.set_args(first, last, predicate);
 122     return kernel.exec(queue);
 123 }
 124
 125 } // end detail namespace
 126 } // end compute namespace
 127 } // end boost namespace
 128
 129 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP