//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP

#include <cmath>
#include <iterator>

#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {
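// Finds the position of the first element in [first, last) for which
// predicate returns true, checking one value per work-item and resolving
// concurrent matches with an atomic min on a shared index.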
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
                                                  InputIterator last,
                                                  UnaryPredicate predicate,
                                                  const size_t count,
                                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    atomic_min<uint_> atomic_min_uint;

    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << " = "
      << first[k.var<const uint_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    // initialize index to count (the index of last) so that
    // "no match" reads back as the last iterator
    index.write(static_cast<uint_>(count), queue);
    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read the winning index and return the corresponding iterator
    return first + static_cast<difference_type>(index.read(queue));
}

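// Same idea, but each work-item examines vpt ("values per thread")
// elements and exits early once a smaller index has already been found.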
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;
    // on GPUs reads from global memory are coalesced: adjacent work-items
    // read adjacent elements, then stride forward by the local size
    if(device.type() & device::gpu) {
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                "count" <<
            ");\n" <<

            // check if an index has already been found
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n" <<
            "}\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                        first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id += lsize;\n" <<
            "}\n";
    // on CPUs (and other devices) each work-item reads its values in
    // order, which makes efficient use of the large cache
    } else {
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + " << k.var<uint_>("vpt") << "," <<
                "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                        first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to count (the index of last) so that
    // "no match" reads back as the last iterator
    index.write(static_cast<uint_>(count), queue);

    // launch one work-item per vpt values, rounded up
    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read the winning index and return the corresponding iterator
    return first + static_cast<difference_type>(index.read(queue));
}

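// Space-efficient implementation of find_if() that resolves the first
// matching position with atomic_min() on a single index value instead of
// a reduction. Chooses between the one-value-per-thread and
// multiple-values-per-thread kernels based on the device type, the input
// size and cached tuning parameters.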
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
                                          InputIterator last,
                                          UnaryPredicate predicate,
                                          command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const device &device = queue.get_device();

    // load cached tuning parameters
    std::string cache_key = std::string("__boost_find_if_with_atomics_")
        + type_name<value_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // for relatively small inputs on GPUs, a kernel that checks one value
    // per thread (work-item) is more efficient than the
    // multiple-values-per-thread version
    if(device.type() & device::gpu){
        const size_t one_vpt_threshold =
            parameters->get(cache_key, "one_vpt_threshold", 1048576);
        if(count <= one_vpt_threshold){
            return find_if_with_atomics_one_vpt(
                first, last, predicate, count, queue
            );
        }
    }

    // values per thread (work-item)
    size_t vpt;
    if(device.type() & device::gpu){
        // get the cached vpt parameter
        vpt = parameters->get(cache_key, "vpt", 32);
    } else {
        // on CPUs the work is split evenly between compute units
        const size_t max_compute_units =
            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
        vpt = static_cast<size_t>(
            std::ceil(float(count) / max_compute_units)
        );
    }

    return find_if_with_atomics_multiple_vpt(
        first, last, predicate, count, vpt, queue
    );
}
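
// Usage sketch: the public boost::compute::find_if() dispatches here for
// device iterators, e.g. (assumes <boost/compute/lambda.hpp> is included
// for the _1 placeholder):
//
//   boost::compute::vector<int> vec = /* ... */;
//   using boost::compute::lambda::_1;
//   boost::compute::vector<int>::iterator it =
//       boost::compute::find_if(vec.begin(), vec.end(), _1 > 4, queue);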

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP