]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/copy_on_device.hpp
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / copy_on_device.hpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
13
14 #include <iterator>
15
16 #include <boost/compute/command_queue.hpp>
17 #include <boost/compute/async/future.hpp>
18 #include <boost/compute/iterator/buffer_iterator.hpp>
19 #include <boost/compute/iterator/discard_iterator.hpp>
20 #include <boost/compute/memory/svm_ptr.hpp>
21 #include <boost/compute/detail/iterator_range_size.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/detail/parameter_cache.hpp>
24 #include <boost/compute/detail/work_size.hpp>
25 #include <boost/compute/detail/vendor.hpp>
26
27 namespace boost {
28 namespace compute {
29 namespace detail {
30
31 template<class InputIterator, class OutputIterator>
32 inline event copy_on_device_cpu(InputIterator first,
33 OutputIterator result,
34 size_t count,
35 command_queue &queue)
36 {
37 meta_kernel k("copy");
38 const device& device = queue.get_device();
39
40 k <<
41 "uint block = " <<
42 "(uint)ceil(((float)count)/get_global_size(0));\n" <<
43 "uint index = get_global_id(0) * block;\n" <<
44 "uint end = min(count, index + block);\n" <<
45 "while(index < end){\n" <<
46 result[k.var<uint_>("index")] << '=' <<
47 first[k.var<uint_>("index")] << ";\n" <<
48 "index++;\n" <<
49 "}\n";
50
51 k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
52
53 size_t global_work_size = device.compute_units();
54 if(count <= 1024) global_work_size = 1;
55 return k.exec_1d(queue, 0, global_work_size);
56 }
57
58 template<class InputIterator, class OutputIterator>
59 inline event copy_on_device_gpu(InputIterator first,
60 OutputIterator result,
61 size_t count,
62 command_queue &queue)
63 {
64 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
65
66 const device& device = queue.get_device();
67 boost::shared_ptr<parameter_cache> parameters =
68 detail::parameter_cache::get_global_cache(device);
69 std::string cache_key =
70 "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
71
72 uint_ vpt = parameters->get(cache_key, "vpt", 4);
73 uint_ tpb = parameters->get(cache_key, "tpb", 128);
74
75 meta_kernel k("copy");
76 k <<
77 "uint index = get_local_id(0) + " <<
78 "(" << vpt * tpb << " * get_group_id(0));\n" <<
79 "for(uint i = 0; i < " << vpt << "; i++){\n" <<
80 " if(index < count){\n" <<
81 result[k.var<uint_>("index")] << '=' <<
82 first[k.var<uint_>("index")] << ";\n" <<
83 " index += " << tpb << ";\n"
84 " }\n"
85 "}\n";
86
87 k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
88 size_t global_work_size = calculate_work_size(count, vpt, tpb);
89 return k.exec_1d(queue, 0, global_work_size, tpb);
90 }
91
92 template<class InputIterator, class OutputIterator>
93 inline event dispatch_copy_on_device(InputIterator first,
94 InputIterator last,
95 OutputIterator result,
96 command_queue &queue)
97 {
98 const size_t count = detail::iterator_range_size(first, last);
99
100 if(count == 0){
101 // nothing to do
102 return event();
103 }
104
105 const device& device = queue.get_device();
106 // copy_on_device_cpu() does not work for CPU on Apple platform
107 // due to bug in its compiler.
108 // See https://github.com/boostorg/compute/pull/626
109 if((device.type() & device::cpu) && !is_apple_platform_device(device))
110 {
111 return copy_on_device_cpu(first, result, count, queue);
112 }
113 return copy_on_device_gpu(first, result, count, queue);
114 }
115
116 template<class InputIterator, class OutputIterator>
117 inline OutputIterator copy_on_device(InputIterator first,
118 InputIterator last,
119 OutputIterator result,
120 command_queue &queue)
121 {
122 dispatch_copy_on_device(first, last, result, queue);
123 return result + std::distance(first, last);
124 }
125
126 template<class InputIterator>
127 inline discard_iterator copy_on_device(InputIterator first,
128 InputIterator last,
129 discard_iterator result,
130 command_queue &queue)
131 {
132 (void) queue;
133
134 return result + std::distance(first, last);
135 }
136
137 template<class InputIterator, class OutputIterator>
138 inline future<OutputIterator> copy_on_device_async(InputIterator first,
139 InputIterator last,
140 OutputIterator result,
141 command_queue &queue)
142 {
143 event event_ = dispatch_copy_on_device(first, last, result, queue);
144 return make_future(result + std::distance(first, last), event_);
145 }
146
147 #ifdef CL_VERSION_2_0
148 // copy_on_device() specialization for svm_ptr
149 template<class T>
150 inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
151 svm_ptr<T> last,
152 svm_ptr<T> result,
153 command_queue &queue)
154 {
155 size_t count = iterator_range_size(first, last);
156 if(count == 0){
157 return result;
158 }
159
160 queue.enqueue_svm_memcpy(
161 result.get(), first.get(), count * sizeof(T)
162 );
163
164 return result + count;
165 }
166
167 template<class T>
168 inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
169 svm_ptr<T> last,
170 svm_ptr<T> result,
171 command_queue &queue)
172 {
173 size_t count = iterator_range_size(first, last);
174 if(count == 0){
175 return future<svm_ptr<T> >();
176 }
177
178 event event_ = queue.enqueue_svm_memcpy_async(
179 result.get(), first.get(), count * sizeof(T)
180 );
181
182 return make_future(result + count, event_);
183 }
184 #endif // CL_VERSION_2_0
185
186 } // end detail namespace
187 } // end compute namespace
188 } // end boost namespace
189
190 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP