]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | //---------------------------------------------------------------------------// |
2 | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP | |
13 | ||
14 | #include <iterator> | |
15 | ||
16 | #include <boost/compute/command_queue.hpp> | |
17 | #include <boost/compute/async/future.hpp> | |
18 | #include <boost/compute/iterator/buffer_iterator.hpp> | |
19 | #include <boost/compute/iterator/discard_iterator.hpp> | |
20 | #include <boost/compute/memory/svm_ptr.hpp> | |
21 | #include <boost/compute/detail/iterator_range_size.hpp> | |
22 | #include <boost/compute/detail/meta_kernel.hpp> | |
23 | #include <boost/compute/detail/parameter_cache.hpp> | |
24 | #include <boost/compute/detail/work_size.hpp> | |
25 | #include <boost/compute/detail/vendor.hpp> | |
26 | ||
27 | namespace boost { | |
28 | namespace compute { | |
29 | namespace detail { | |
30 | ||
31 | template<class InputIterator, class OutputIterator> | |
32 | inline event copy_on_device_cpu(InputIterator first, | |
33 | OutputIterator result, | |
34 | size_t count, | |
35 | command_queue &queue) | |
36 | { | |
37 | meta_kernel k("copy"); | |
38 | const device& device = queue.get_device(); | |
39 | ||
40 | k << | |
41 | "uint block = " << | |
42 | "(uint)ceil(((float)count)/get_global_size(0));\n" << | |
43 | "uint index = get_global_id(0) * block;\n" << | |
44 | "uint end = min(count, index + block);\n" << | |
45 | "while(index < end){\n" << | |
46 | result[k.var<uint_>("index")] << '=' << | |
47 | first[k.var<uint_>("index")] << ";\n" << | |
48 | "index++;\n" << | |
49 | "}\n"; | |
50 | ||
51 | k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); | |
52 | ||
53 | size_t global_work_size = device.compute_units(); | |
54 | if(count <= 1024) global_work_size = 1; | |
55 | return k.exec_1d(queue, 0, global_work_size); | |
56 | } | |
57 | ||
58 | template<class InputIterator, class OutputIterator> | |
59 | inline event copy_on_device_gpu(InputIterator first, | |
60 | OutputIterator result, | |
61 | size_t count, | |
62 | command_queue &queue) | |
63 | { | |
64 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
65 | ||
66 | const device& device = queue.get_device(); | |
67 | boost::shared_ptr<parameter_cache> parameters = | |
68 | detail::parameter_cache::get_global_cache(device); | |
69 | std::string cache_key = | |
70 | "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); | |
71 | ||
72 | uint_ vpt = parameters->get(cache_key, "vpt", 4); | |
73 | uint_ tpb = parameters->get(cache_key, "tpb", 128); | |
74 | ||
75 | meta_kernel k("copy"); | |
76 | k << | |
77 | "uint index = get_local_id(0) + " << | |
78 | "(" << vpt * tpb << " * get_group_id(0));\n" << | |
79 | "for(uint i = 0; i < " << vpt << "; i++){\n" << | |
80 | " if(index < count){\n" << | |
81 | result[k.var<uint_>("index")] << '=' << | |
82 | first[k.var<uint_>("index")] << ";\n" << | |
83 | " index += " << tpb << ";\n" | |
84 | " }\n" | |
85 | "}\n"; | |
86 | ||
87 | k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); | |
88 | size_t global_work_size = calculate_work_size(count, vpt, tpb); | |
89 | return k.exec_1d(queue, 0, global_work_size, tpb); | |
90 | } | |
91 | ||
92 | template<class InputIterator, class OutputIterator> | |
93 | inline event dispatch_copy_on_device(InputIterator first, | |
94 | InputIterator last, | |
95 | OutputIterator result, | |
96 | command_queue &queue) | |
97 | { | |
98 | const size_t count = detail::iterator_range_size(first, last); | |
99 | ||
100 | if(count == 0){ | |
101 | // nothing to do | |
102 | return event(); | |
103 | } | |
104 | ||
105 | const device& device = queue.get_device(); | |
106 | // copy_on_device_cpu() does not work for CPU on Apple platform | |
107 | // due to bug in its compiler. | |
108 | // See https://github.com/boostorg/compute/pull/626 | |
109 | if((device.type() & device::cpu) && !is_apple_platform_device(device)) | |
110 | { | |
111 | return copy_on_device_cpu(first, result, count, queue); | |
112 | } | |
113 | return copy_on_device_gpu(first, result, count, queue); | |
114 | } | |
115 | ||
116 | template<class InputIterator, class OutputIterator> | |
117 | inline OutputIterator copy_on_device(InputIterator first, | |
118 | InputIterator last, | |
119 | OutputIterator result, | |
120 | command_queue &queue) | |
121 | { | |
122 | dispatch_copy_on_device(first, last, result, queue); | |
123 | return result + std::distance(first, last); | |
124 | } | |
125 | ||
126 | template<class InputIterator> | |
127 | inline discard_iterator copy_on_device(InputIterator first, | |
128 | InputIterator last, | |
129 | discard_iterator result, | |
130 | command_queue &queue) | |
131 | { | |
132 | (void) queue; | |
133 | ||
134 | return result + std::distance(first, last); | |
135 | } | |
136 | ||
137 | template<class InputIterator, class OutputIterator> | |
138 | inline future<OutputIterator> copy_on_device_async(InputIterator first, | |
139 | InputIterator last, | |
140 | OutputIterator result, | |
141 | command_queue &queue) | |
142 | { | |
143 | event event_ = dispatch_copy_on_device(first, last, result, queue); | |
144 | return make_future(result + std::distance(first, last), event_); | |
145 | } | |
146 | ||
147 | #ifdef CL_VERSION_2_0 | |
148 | // copy_on_device() specialization for svm_ptr | |
149 | template<class T> | |
150 | inline svm_ptr<T> copy_on_device(svm_ptr<T> first, | |
151 | svm_ptr<T> last, | |
152 | svm_ptr<T> result, | |
153 | command_queue &queue) | |
154 | { | |
155 | size_t count = iterator_range_size(first, last); | |
156 | if(count == 0){ | |
157 | return result; | |
158 | } | |
159 | ||
160 | queue.enqueue_svm_memcpy( | |
161 | result.get(), first.get(), count * sizeof(T) | |
162 | ); | |
163 | ||
164 | return result + count; | |
165 | } | |
166 | ||
167 | template<class T> | |
168 | inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first, | |
169 | svm_ptr<T> last, | |
170 | svm_ptr<T> result, | |
171 | command_queue &queue) | |
172 | { | |
173 | size_t count = iterator_range_size(first, last); | |
174 | if(count == 0){ | |
175 | return future<svm_ptr<T> >(); | |
176 | } | |
177 | ||
178 | event event_ = queue.enqueue_svm_memcpy_async( | |
179 | result.get(), first.get(), count * sizeof(T) | |
180 | ); | |
181 | ||
182 | return make_future(result + count, event_); | |
183 | } | |
184 | #endif // CL_VERSION_2_0 | |
185 | ||
186 | } // end detail namespace | |
187 | } // end compute namespace | |
188 | } // end boost namespace | |
189 | ||
190 | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP |