ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/copy_on_device.hpp

   1 //---------------------------------------------------------------------------//
   2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
   3 //
   4 // Distributed under the Boost Software License, Version 1.0
   5 // See accompanying file LICENSE_1_0.txt or copy at
   6 // http://www.boost.org/LICENSE_1_0.txt
   7 //
   8 // See http://boostorg.github.com/compute for more information.
   9 //---------------------------------------------------------------------------//
  10
  11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
  12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
  13
  14 #include <iterator>
  15
  16 #include <boost/compute/command_queue.hpp>
  17 #include <boost/compute/async/future.hpp>
  18 #include <boost/compute/iterator/buffer_iterator.hpp>
  19 #include <boost/compute/iterator/discard_iterator.hpp>
  20 #include <boost/compute/memory/svm_ptr.hpp>
  21 #include <boost/compute/detail/iterator_range_size.hpp>
  22 #include <boost/compute/detail/meta_kernel.hpp>
  23 #include <boost/compute/detail/parameter_cache.hpp>
  24 #include <boost/compute/detail/work_size.hpp>
  25 #include <boost/compute/detail/vendor.hpp>
  26
  27 namespace boost {
  28 namespace compute {
  29 namespace detail {
  30
  31 template<class InputIterator, class OutputIterator>
  32 inline event copy_on_device_cpu(InputIterator first,
  33                                 OutputIterator result,
  34                                 size_t count,
  35                                 command_queue &queue)
  36 {
  37     meta_kernel k("copy");
  38     const device& device = queue.get_device();
  39
  40     k <<
  41         "uint block = " <<
  42             "(uint)ceil(((float)count)/get_global_size(0));\n" <<
  43         "uint index = get_global_id(0) * block;\n" <<
  44         "uint end = min(count, index + block);\n" <<
  45         "while(index < end){\n" <<
  46             result[k.var<uint_>("index")] << '=' <<
  47                 first[k.var<uint_>("index")] << ";\n" <<
  48             "index++;\n" <<
  49         "}\n";
  50
  51     k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
  52
  53     size_t global_work_size = device.compute_units();
  54     if(count <= 1024) global_work_size = 1;
  55     return k.exec_1d(queue, 0, global_work_size);
  56 }
  57
  58 template<class InputIterator, class OutputIterator>
  59 inline event copy_on_device_gpu(InputIterator first,
  60                                 OutputIterator result,
  61                                 size_t count,
  62                                 command_queue &queue)
  63 {
  64     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
  65
  66     const device& device = queue.get_device();
  67     boost::shared_ptr<parameter_cache> parameters =
  68         detail::parameter_cache::get_global_cache(device);
  69     std::string cache_key =
  70         "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
  71
  72     uint_ vpt = parameters->get(cache_key, "vpt", 4);
  73     uint_ tpb = parameters->get(cache_key, "tpb", 128);
  74
  75     meta_kernel k("copy");
  76     k <<
  77         "uint index = get_local_id(0) + " <<
  78             "(" << vpt * tpb << " * get_group_id(0));\n" <<
  79         "for(uint i = 0; i < " << vpt << "; i++){\n" <<
  80         "    if(index < count){\n" <<
  81                 result[k.var<uint_>("index")] << '=' <<
  82                     first[k.var<uint_>("index")] << ";\n" <<
  83         "       index += " << tpb << ";\n"
  84         "    }\n"
  85         "}\n";
  86
  87     k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
  88     size_t global_work_size = calculate_work_size(count, vpt, tpb);
  89     return k.exec_1d(queue, 0, global_work_size, tpb);
  90 }
  91
  92 template<class InputIterator, class OutputIterator>
  93 inline event dispatch_copy_on_device(InputIterator first,
  94                                      InputIterator last,
  95                                      OutputIterator result,
  96                                      command_queue &queue)
  97 {
  98     const size_t count = detail::iterator_range_size(first, last);
  99
 100     if(count == 0){
 101         // nothing to do
 102         return event();
 103     }
 104
 105     const device& device = queue.get_device();
 106     // copy_on_device_cpu() does not work for CPU on Apple platform
 107     // due to bug in its compiler.
 108     // See https://github.com/boostorg/compute/pull/626
 109     if((device.type() & device::cpu) && !is_apple_platform_device(device))
 110     {
 111         return copy_on_device_cpu(first, result, count, queue);
 112     }
 113     return copy_on_device_gpu(first, result, count, queue);
 114 }
 115
 116 template<class InputIterator, class OutputIterator>
 117 inline OutputIterator copy_on_device(InputIterator first,
 118                                      InputIterator last,
 119                                      OutputIterator result,
 120                                      command_queue &queue)
 121 {
 122     dispatch_copy_on_device(first, last, result, queue);
 123     return result + std::distance(first, last);
 124 }
 125
 126 template<class InputIterator>
 127 inline discard_iterator copy_on_device(InputIterator first,
 128                                        InputIterator last,
 129                                        discard_iterator result,
 130                                        command_queue &queue)
 131 {
 132     (void) queue;
 133
 134     return result + std::distance(first, last);
 135 }
 136
 137 template<class InputIterator, class OutputIterator>
 138 inline future<OutputIterator> copy_on_device_async(InputIterator first,
 139                                                    InputIterator last,
 140                                                    OutputIterator result,
 141                                                    command_queue &queue)
 142 {
 143     event event_ = dispatch_copy_on_device(first, last, result, queue);
 144     return make_future(result + std::distance(first, last), event_);
 145 }
 146
 147 #ifdef CL_VERSION_2_0
 148 // copy_on_device() specialization for svm_ptr
 149 template<class T>
 150 inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
 151                                  svm_ptr<T> last,
 152                                  svm_ptr<T> result,
 153                                  command_queue &queue)
 154 {
 155     size_t count = iterator_range_size(first, last);
 156     if(count == 0){
 157         return result;
 158     }
 159
 160     queue.enqueue_svm_memcpy(
 161         result.get(), first.get(), count * sizeof(T)
 162     );
 163
 164     return result + count;
 165 }
 166
 167 template<class T>
 168 inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
 169                                                 svm_ptr<T> last,
 170                                                 svm_ptr<T> result,
 171                                                 command_queue &queue)
 172 {
 173     size_t count = iterator_range_size(first, last);
 174     if(count == 0){
 175         return future<svm_ptr<T> >();
 176     }
 177
 178     event event_ = queue.enqueue_svm_memcpy_async(
 179         result.get(), first.get(), count * sizeof(T)
 180     );
 181
 182     return make_future(result + count, event_);
 183 }
 184 #endif // CL_VERSION_2_0
 185
 186 } // end detail namespace
 187 } // end compute namespace
 188 } // end boost namespace
 189
 190 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP