]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | //---------------------------------------------------------------------------// |
2 | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_COPY_HPP | |
13 | ||
14 | #include <algorithm> | |
15 | #include <iterator> | |
16 | ||
17 | #include <boost/utility/enable_if.hpp> | |
18 | ||
19 | #include <boost/mpl/and.hpp> | |
20 | #include <boost/mpl/not.hpp> | |
21 | #include <boost/mpl/or.hpp> | |
22 | ||
23 | #include <boost/compute/buffer.hpp> | |
24 | #include <boost/compute/system.hpp> | |
25 | #include <boost/compute/command_queue.hpp> | |
26 | #include <boost/compute/algorithm/detail/copy_on_device.hpp> | |
27 | #include <boost/compute/algorithm/detail/copy_to_device.hpp> | |
28 | #include <boost/compute/algorithm/detail/copy_to_host.hpp> | |
29 | #include <boost/compute/async/future.hpp> | |
30 | #include <boost/compute/container/mapped_view.hpp> | |
31 | #include <boost/compute/detail/device_ptr.hpp> | |
32 | #include <boost/compute/detail/is_contiguous_iterator.hpp> | |
33 | #include <boost/compute/detail/iterator_range_size.hpp> | |
34 | #include <boost/compute/detail/parameter_cache.hpp> | |
35 | #include <boost/compute/iterator/buffer_iterator.hpp> | |
36 | #include <boost/compute/type_traits/type_name.hpp> | |
37 | #include <boost/compute/type_traits/is_device_iterator.hpp> | |
38 | ||
39 | namespace boost { | |
40 | namespace compute { | |
41 | namespace detail { | |
42 | ||
43 | namespace mpl = boost::mpl; | |
44 | ||
// meta-function returning true if copy() between InputIterator and
// OutputIterator can be implemented with clEnqueueCopyBuffer().
//
// The result is true only when all three conditions hold:
//   - InputIterator is exactly buffer_iterator<T> or detail::device_ptr<T>,
//     where T is InputIterator::value_type;
//   - OutputIterator is exactly buffer_iterator<U> or detail::device_ptr<U>,
//     where U is OutputIterator::value_type;
//   - T and U are the same type (compared as written, cv-qualifiers are
//     not stripped here).
//
// Both iterator types must expose a nested value_type. Every use in this
// file places the trait after the is_device_iterator<> checks inside
// mpl::and_<>.
template<class InputIterator, class OutputIterator>
struct can_copy_with_copy_buffer :
    mpl::and_<
        mpl::or_<
            boost::is_same<
                InputIterator,
                buffer_iterator<typename InputIterator::value_type>
            >,
            boost::is_same<
                InputIterator,
                detail::device_ptr<typename InputIterator::value_type>
            >
        >,
        mpl::or_<
            boost::is_same<
                OutputIterator,
                buffer_iterator<typename OutputIterator::value_type>
            >,
            boost::is_same<
                OutputIterator,
                detail::device_ptr<typename OutputIterator::value_type>
            >
        >,
        boost::is_same<
            typename InputIterator::value_type,
            typename OutputIterator::value_type
        >
    >::type {};
75 | ||
// meta-function returning true if the value_types of HostIterator and
// DeviceIterator are the same type.
//
// cv-qualifiers are removed from both value_types before they are compared.
// The host side is read through std::iterator_traits<>, while the device
// side must expose a nested value_type member.
template<class HostIterator, class DeviceIterator>
struct is_same_value_type :
    boost::is_same<
        typename boost::remove_cv<
            typename std::iterator_traits<HostIterator>::value_type
        >::type,
        typename boost::remove_cv<
            typename DeviceIterator::value_type
        >::type
    >::type {};
88 | ||
// meta-function returning true if the value_type of HostIterator is bool.
//
// The value_type is read through std::iterator_traits<> and its
// cv-qualifiers are removed before the comparison, so "const bool" also
// yields true.
template<class HostIterator>
struct is_bool_value_type :
    boost::is_same<
        typename boost::remove_cv<
            typename std::iterator_traits<HostIterator>::value_type
        >::type,
        bool
    >::type {};
98 | ||
// host -> device (async)
// Value types of InputIterator and OutputIterator are the same.
//
// Selected when InputIterator is not a device iterator, OutputIterator is a
// device iterator and both have the same value_type (ignoring cv-qualifiers).
// Compilation fails with a static assertion when InputIterator is not a
// contiguous iterator; otherwise the call is forwarded unchanged to
// copy_to_device_async().
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            mpl::not_<
                                is_device_iterator<InputIterator>
                            >,
                            is_device_iterator<OutputIterator>,
                            is_same_value_type<InputIterator, OutputIterator>
                        >
                    >::type* = 0)
{
    // asynchronous host transfers need one contiguous block of host memory
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<InputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_device_async(first, last, result, queue, events);
}
124 | ||
125 | // host -> device (async) | |
126 | // Type mismatch between InputIterator and OutputIterator value_types | |
127 | template<class InputIterator, class OutputIterator> | |
128 | inline future<OutputIterator> | |
129 | dispatch_copy_async(InputIterator first, | |
130 | InputIterator last, | |
131 | OutputIterator result, | |
132 | command_queue &queue, | |
92f5a8d4 | 133 | const wait_list &events, |
7c673cae FG |
134 | typename boost::enable_if< |
135 | mpl::and_< | |
136 | mpl::not_< | |
137 | is_device_iterator<InputIterator> | |
138 | >, | |
139 | is_device_iterator<OutputIterator>, | |
140 | mpl::not_< | |
141 | is_same_value_type<InputIterator, OutputIterator> | |
142 | > | |
143 | > | |
144 | >::type* = 0) | |
145 | { | |
146 | BOOST_STATIC_ASSERT_MSG( | |
147 | is_contiguous_iterator<InputIterator>::value, | |
148 | "copy_async() is only supported for contiguous host iterators" | |
149 | ); | |
150 | ||
151 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
152 | ||
153 | const context &context = queue.get_context(); | |
154 | size_t count = iterator_range_size(first, last); | |
155 | ||
156 | if(count < size_t(1)) { | |
157 | return future<OutputIterator>(); | |
158 | } | |
159 | ||
160 | // map [first; last) to device and run copy kernel | |
161 | // on device for copying & casting | |
162 | ::boost::compute::mapped_view<input_type> mapped_host( | |
163 | // make sure it's a pointer to constant data | |
164 | // to force read only mapping | |
165 | const_cast<const input_type*>( | |
166 | ::boost::addressof(*first) | |
167 | ), | |
168 | count, | |
169 | context | |
170 | ); | |
171 | return copy_on_device_async( | |
92f5a8d4 | 172 | mapped_host.begin(), mapped_host.end(), result, queue, events |
7c673cae FG |
173 | ); |
174 | } | |
175 | ||
// host -> device
// InputIterator is a contiguous iterator
//
// Selected when InputIterator is a contiguous non-device iterator,
// OutputIterator is a device iterator and both have the same value_type
// (ignoring cv-qualifiers). No conversion is needed, so the call is
// forwarded unchanged to copy_to_device().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      is_same_value_type<InputIterator, OutputIterator>,
                      is_contiguous_iterator<InputIterator>
                  >
              >::type* = 0)
{
    return copy_to_device(first, last, result, queue, events);
}
198 | ||
199 | // host -> device | |
200 | // Type mismatch between InputIterator and OutputIterator value_types | |
201 | // InputIterator is a contiguous iterator | |
202 | template<class InputIterator, class OutputIterator> | |
203 | inline OutputIterator | |
204 | dispatch_copy(InputIterator first, | |
205 | InputIterator last, | |
206 | OutputIterator result, | |
207 | command_queue &queue, | |
92f5a8d4 | 208 | const wait_list &events, |
7c673cae FG |
209 | typename boost::enable_if< |
210 | mpl::and_< | |
211 | mpl::not_< | |
212 | is_device_iterator<InputIterator> | |
213 | >, | |
214 | is_device_iterator<OutputIterator>, | |
215 | mpl::not_< | |
216 | is_same_value_type<InputIterator, OutputIterator> | |
217 | >, | |
218 | is_contiguous_iterator<InputIterator> | |
219 | > | |
220 | >::type* = 0) | |
221 | { | |
222 | typedef typename OutputIterator::value_type output_type; | |
223 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
224 | ||
225 | const device &device = queue.get_device(); | |
226 | ||
227 | // loading parameters | |
228 | std::string cache_key = | |
229 | std::string("__boost_compute_copy_to_device_") | |
230 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
231 | boost::shared_ptr<parameter_cache> parameters = | |
232 | detail::parameter_cache::get_global_cache(device); | |
233 | ||
b32b8144 FG |
234 | uint_ map_copy_threshold; |
235 | uint_ direct_copy_threshold; | |
7c673cae FG |
236 | |
237 | // calculate default values of thresholds | |
238 | if (device.type() & device::gpu) { | |
239 | // GPUs | |
240 | map_copy_threshold = 524288; // 0.5 MB | |
241 | direct_copy_threshold = 52428800; // 50 MB | |
242 | } | |
243 | else { | |
244 | // CPUs and other devices | |
245 | map_copy_threshold = 134217728; // 128 MB | |
246 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
247 | } | |
248 | ||
249 | // load thresholds | |
250 | map_copy_threshold = | |
251 | parameters->get( | |
252 | cache_key, "map_copy_threshold", map_copy_threshold | |
253 | ); | |
254 | direct_copy_threshold = | |
255 | parameters->get( | |
256 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
257 | ); | |
258 | ||
259 | // select copy method based on thresholds & input_size_bytes | |
260 | size_t count = iterator_range_size(first, last); | |
261 | size_t input_size_bytes = count * sizeof(input_type); | |
262 | ||
263 | // [0; map_copy_threshold) -> copy_to_device_map() | |
264 | if(input_size_bytes < map_copy_threshold) { | |
92f5a8d4 | 265 | return copy_to_device_map(first, last, result, queue, events); |
7c673cae FG |
266 | } |
267 | // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) | |
268 | // on host and then perform copy_to_device() | |
269 | else if(input_size_bytes < direct_copy_threshold) { | |
270 | std::vector<output_type> vector(first, last); | |
92f5a8d4 TL |
271 | return copy_to_device( |
272 | vector.begin(), vector.end(), result, queue, events | |
273 | ); | |
7c673cae FG |
274 | } |
275 | ||
276 | // [direct_copy_threshold; inf) -> map [first; last) to device and | |
277 | // run copy kernel on device for copying & casting | |
278 | // At this point we are sure that count > 1 (first != last). | |
279 | ||
280 | // Perform async copy to device, wait for it to be finished and | |
281 | // return the result. | |
282 | // At this point we are sure that count > 1 (first != last), so event | |
283 | // returned by dispatch_copy_async() must be valid. | |
92f5a8d4 | 284 | return dispatch_copy_async(first, last, result, queue, events).get(); |
7c673cae FG |
285 | } |
286 | ||
287 | // host -> device | |
288 | // InputIterator is NOT a contiguous iterator | |
289 | template<class InputIterator, class OutputIterator> | |
290 | inline OutputIterator | |
291 | dispatch_copy(InputIterator first, | |
292 | InputIterator last, | |
293 | OutputIterator result, | |
294 | command_queue &queue, | |
92f5a8d4 | 295 | const wait_list &events, |
7c673cae FG |
296 | typename boost::enable_if< |
297 | mpl::and_< | |
298 | mpl::not_< | |
299 | is_device_iterator<InputIterator> | |
300 | >, | |
301 | is_device_iterator<OutputIterator>, | |
302 | mpl::not_< | |
303 | is_contiguous_iterator<InputIterator> | |
304 | > | |
305 | > | |
306 | >::type* = 0) | |
307 | { | |
308 | typedef typename OutputIterator::value_type output_type; | |
309 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
310 | ||
311 | const device &device = queue.get_device(); | |
312 | ||
313 | // loading parameters | |
314 | std::string cache_key = | |
315 | std::string("__boost_compute_copy_to_device_") | |
316 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
317 | boost::shared_ptr<parameter_cache> parameters = | |
318 | detail::parameter_cache::get_global_cache(device); | |
319 | ||
b32b8144 FG |
320 | uint_ map_copy_threshold; |
321 | uint_ direct_copy_threshold; | |
7c673cae FG |
322 | |
323 | // calculate default values of thresholds | |
324 | if (device.type() & device::gpu) { | |
325 | // GPUs | |
326 | map_copy_threshold = 524288; // 0.5 MB | |
327 | direct_copy_threshold = 52428800; // 50 MB | |
328 | } | |
329 | else { | |
330 | // CPUs and other devices | |
331 | map_copy_threshold = 134217728; // 128 MB | |
332 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
333 | } | |
334 | ||
335 | // load thresholds | |
336 | map_copy_threshold = | |
337 | parameters->get( | |
338 | cache_key, "map_copy_threshold", map_copy_threshold | |
339 | ); | |
340 | direct_copy_threshold = | |
341 | parameters->get( | |
342 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
343 | ); | |
344 | ||
345 | // select copy method based on thresholds & input_size_bytes | |
346 | size_t input_size = iterator_range_size(first, last); | |
347 | size_t input_size_bytes = input_size * sizeof(input_type); | |
348 | ||
349 | // [0; map_copy_threshold) -> copy_to_device_map() | |
350 | // | |
351 | // if direct_copy_threshold is less than map_copy_threshold | |
352 | // copy_to_device_map() is used for every input | |
353 | if(input_size_bytes < map_copy_threshold | |
354 | || direct_copy_threshold <= map_copy_threshold) { | |
92f5a8d4 | 355 | return copy_to_device_map(first, last, result, queue, events); |
7c673cae FG |
356 | } |
357 | // [map_copy_threshold; inf) -> convert [first; last) | |
358 | // on host and then perform copy_to_device() | |
359 | std::vector<output_type> vector(first, last); | |
92f5a8d4 | 360 | return copy_to_device(vector.begin(), vector.end(), result, queue, events); |
7c673cae FG |
361 | } |
362 | ||
// device -> host (async)
// Value types of InputIterator and OutputIterator are the same.
//
// Selected when InputIterator is a device iterator, OutputIterator is not,
// and both have the same value_type (ignoring cv-qualifiers). Note that
// is_same_value_type<> takes the host iterator first, hence the swapped
// argument order below. Compilation fails with a static assertion when
// OutputIterator is not contiguous; otherwise the call is forwarded
// unchanged to copy_to_host_async().
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            mpl::not_<
                                is_device_iterator<OutputIterator>
                            >,
                            is_same_value_type<OutputIterator, InputIterator>
                        >
                    >::type* = 0)
{
    // asynchronous host transfers need one contiguous block of host memory
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<OutputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_host_async(first, last, result, queue, events);
}
388 | ||
389 | // device -> host (async) | |
390 | // Type mismatch between InputIterator and OutputIterator value_types | |
391 | template<class InputIterator, class OutputIterator> | |
392 | inline future<OutputIterator> | |
393 | dispatch_copy_async(InputIterator first, | |
394 | InputIterator last, | |
395 | OutputIterator result, | |
396 | command_queue &queue, | |
92f5a8d4 | 397 | const wait_list &events, |
7c673cae FG |
398 | typename boost::enable_if< |
399 | mpl::and_< | |
400 | is_device_iterator<InputIterator>, | |
401 | mpl::not_< | |
402 | is_device_iterator<OutputIterator> | |
403 | >, | |
404 | mpl::not_< | |
405 | is_same_value_type<OutputIterator, InputIterator> | |
406 | > | |
407 | > | |
408 | >::type* = 0) | |
409 | { | |
410 | BOOST_STATIC_ASSERT_MSG( | |
411 | is_contiguous_iterator<OutputIterator>::value, | |
412 | "copy_async() is only supported for contiguous host iterators" | |
413 | ); | |
414 | ||
415 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
416 | const context &context = queue.get_context(); | |
417 | size_t count = iterator_range_size(first, last); | |
418 | ||
419 | if(count < size_t(1)) { | |
420 | return future<OutputIterator>(); | |
421 | } | |
422 | ||
423 | // map host memory to device | |
424 | buffer mapped_host( | |
425 | context, | |
426 | count * sizeof(output_type), | |
427 | buffer::write_only | buffer::use_host_ptr, | |
428 | static_cast<void*>( | |
429 | ::boost::addressof(*result) | |
430 | ) | |
431 | ); | |
432 | // copy async on device | |
433 | ::boost::compute::future<buffer_iterator<output_type> > future = | |
434 | copy_on_device_async( | |
435 | first, | |
436 | last, | |
437 | make_buffer_iterator<output_type>(mapped_host), | |
92f5a8d4 TL |
438 | queue, |
439 | events | |
7c673cae FG |
440 | ); |
441 | // update host memory asynchronously by maping and unmaping memory | |
442 | event map_event; | |
443 | void* ptr = queue.enqueue_map_buffer_async( | |
444 | mapped_host, | |
445 | CL_MAP_READ, | |
446 | 0, | |
447 | count * sizeof(output_type), | |
448 | map_event, | |
449 | future.get_event() | |
450 | ); | |
451 | event unmap_event = | |
452 | queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); | |
453 | return make_future(result + count, unmap_event); | |
454 | } | |
455 | ||
// device -> host
// OutputIterator is a contiguous iterator
//
// Selected when InputIterator is a device iterator, OutputIterator is a
// contiguous non-device iterator, both have the same value_type (ignoring
// cv-qualifiers) and that value_type is not bool. No conversion is needed,
// so the call is forwarded unchanged to copy_to_host().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      is_same_value_type<OutputIterator, InputIterator>,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    return copy_to_host(first, last, result, queue, events);
}
481 | ||
482 | // device -> host | |
483 | // Type mismatch between InputIterator and OutputIterator value_types | |
484 | // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator | |
485 | // is a boolean type. | |
486 | template<class InputIterator, class OutputIterator> | |
487 | inline OutputIterator | |
488 | dispatch_copy(InputIterator first, | |
489 | InputIterator last, | |
490 | OutputIterator result, | |
491 | command_queue &queue, | |
92f5a8d4 | 492 | const wait_list &events, |
7c673cae FG |
493 | typename boost::enable_if< |
494 | mpl::and_< | |
495 | is_device_iterator<InputIterator>, | |
496 | mpl::not_< | |
497 | is_device_iterator<OutputIterator> | |
498 | >, | |
499 | mpl::or_< | |
500 | mpl::not_< | |
501 | is_contiguous_iterator<OutputIterator> | |
502 | >, | |
503 | is_bool_value_type<OutputIterator> | |
504 | > | |
505 | > | |
506 | >::type* = 0) | |
507 | { | |
508 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
509 | typedef typename InputIterator::value_type input_type; | |
510 | ||
511 | const device &device = queue.get_device(); | |
512 | ||
513 | // loading parameters | |
514 | std::string cache_key = | |
515 | std::string("__boost_compute_copy_to_host_") | |
516 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
517 | boost::shared_ptr<parameter_cache> parameters = | |
518 | detail::parameter_cache::get_global_cache(device); | |
519 | ||
b32b8144 FG |
520 | uint_ map_copy_threshold; |
521 | uint_ direct_copy_threshold; | |
7c673cae FG |
522 | |
523 | // calculate default values of thresholds | |
524 | if (device.type() & device::gpu) { | |
525 | // GPUs | |
526 | map_copy_threshold = 33554432; // 30 MB | |
527 | direct_copy_threshold = 0; // it's never efficient for GPUs | |
528 | } | |
529 | else { | |
530 | // CPUs and other devices | |
531 | map_copy_threshold = 134217728; // 128 MB | |
532 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
533 | } | |
534 | ||
535 | // load thresholds | |
536 | map_copy_threshold = | |
537 | parameters->get( | |
538 | cache_key, "map_copy_threshold", map_copy_threshold | |
539 | ); | |
540 | direct_copy_threshold = | |
541 | parameters->get( | |
542 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
543 | ); | |
544 | ||
545 | // select copy method based on thresholds & input_size_bytes | |
546 | size_t count = iterator_range_size(first, last); | |
547 | size_t input_size_bytes = count * sizeof(input_type); | |
548 | ||
549 | // [0; map_copy_threshold) -> copy_to_host_map() | |
550 | // | |
551 | // if direct_copy_threshold is less than map_copy_threshold | |
552 | // copy_to_host_map() is used for every input | |
553 | if(input_size_bytes < map_copy_threshold | |
554 | || direct_copy_threshold <= map_copy_threshold) { | |
92f5a8d4 | 555 | return copy_to_host_map(first, last, result, queue, events); |
7c673cae FG |
556 | } |
557 | // [map_copy_threshold; inf) -> copy [first;last) to temporary vector | |
558 | // then copy (and convert) to result using std::copy() | |
559 | std::vector<input_type> vector(count); | |
92f5a8d4 | 560 | copy_to_host(first, last, vector.begin(), queue, events); |
7c673cae FG |
561 | return std::copy(vector.begin(), vector.end(), result); |
562 | } | |
563 | ||
564 | // device -> host | |
565 | // Type mismatch between InputIterator and OutputIterator value_types | |
566 | // OutputIterator is a contiguous iterator | |
567 | // value_type of OutputIterator is NOT a boolean type | |
568 | template<class InputIterator, class OutputIterator> | |
569 | inline OutputIterator | |
570 | dispatch_copy(InputIterator first, | |
571 | InputIterator last, | |
572 | OutputIterator result, | |
573 | command_queue &queue, | |
92f5a8d4 | 574 | const wait_list &events, |
7c673cae FG |
575 | typename boost::enable_if< |
576 | mpl::and_< | |
577 | is_device_iterator<InputIterator>, | |
578 | mpl::not_< | |
579 | is_device_iterator<OutputIterator> | |
580 | >, | |
581 | mpl::not_< | |
582 | is_same_value_type<OutputIterator, InputIterator> | |
583 | >, | |
584 | is_contiguous_iterator<OutputIterator>, | |
585 | mpl::not_< | |
586 | is_bool_value_type<OutputIterator> | |
587 | > | |
588 | > | |
589 | >::type* = 0) | |
590 | { | |
591 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
592 | typedef typename InputIterator::value_type input_type; | |
593 | ||
594 | const device &device = queue.get_device(); | |
595 | ||
596 | // loading parameters | |
597 | std::string cache_key = | |
598 | std::string("__boost_compute_copy_to_host_") | |
599 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
600 | boost::shared_ptr<parameter_cache> parameters = | |
601 | detail::parameter_cache::get_global_cache(device); | |
602 | ||
b32b8144 FG |
603 | uint_ map_copy_threshold; |
604 | uint_ direct_copy_threshold; | |
7c673cae FG |
605 | |
606 | // calculate default values of thresholds | |
607 | if (device.type() & device::gpu) { | |
608 | // GPUs | |
609 | map_copy_threshold = 524288; // 0.5 MB | |
610 | direct_copy_threshold = 52428800; // 50 MB | |
611 | } | |
612 | else { | |
613 | // CPUs and other devices | |
614 | map_copy_threshold = 134217728; // 128 MB | |
615 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
616 | } | |
617 | ||
618 | // load thresholds | |
619 | map_copy_threshold = | |
620 | parameters->get( | |
621 | cache_key, "map_copy_threshold", map_copy_threshold | |
622 | ); | |
623 | direct_copy_threshold = | |
624 | parameters->get( | |
625 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
626 | ); | |
627 | ||
628 | // select copy method based on thresholds & input_size_bytes | |
629 | size_t count = iterator_range_size(first, last); | |
630 | size_t input_size_bytes = count * sizeof(input_type); | |
631 | ||
632 | // [0; map_copy_threshold) -> copy_to_host_map() | |
633 | if(input_size_bytes < map_copy_threshold) { | |
92f5a8d4 | 634 | return copy_to_host_map(first, last, result, queue, events); |
7c673cae FG |
635 | } |
636 | // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to | |
637 | // temporary vector then copy (and convert) to result using std::copy() | |
638 | else if(input_size_bytes < direct_copy_threshold) { | |
639 | std::vector<input_type> vector(count); | |
92f5a8d4 | 640 | copy_to_host(first, last, vector.begin(), queue, events); |
7c673cae FG |
641 | return std::copy(vector.begin(), vector.end(), result); |
642 | } | |
643 | ||
644 | // [direct_copy_threshold; inf) -> map [result; result + input_size) to | |
645 | // device and run copy kernel on device for copying & casting | |
646 | // map host memory to device. | |
647 | ||
648 | // Perform async copy to host, wait for it to be finished and | |
649 | // return the result. | |
650 | // At this point we are sure that count > 1 (first != last), so event | |
651 | // returned by dispatch_copy_async() must be valid. | |
92f5a8d4 | 652 | return dispatch_copy_async(first, last, result, queue, events).get(); |
7c673cae FG |
653 | } |
654 | ||
// device -> device
//
// Generic overload: selected when both iterators are device iterators but
// can_copy_with_copy_buffer<> is false for the pair. The call is forwarded
// unchanged to copy_on_device().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          can_copy_with_copy_buffer<
                              InputIterator, OutputIterator
                          >
                      >
                  >
              >::type* = 0)
{
    return copy_on_device(first, last, result, queue, events);
}
677 | ||
678 | // device -> device (specialization for buffer iterators) | |
679 | template<class InputIterator, class OutputIterator> | |
680 | inline OutputIterator | |
681 | dispatch_copy(InputIterator first, | |
682 | InputIterator last, | |
683 | OutputIterator result, | |
684 | command_queue &queue, | |
92f5a8d4 | 685 | const wait_list &events, |
7c673cae FG |
686 | typename boost::enable_if< |
687 | mpl::and_< | |
688 | is_device_iterator<InputIterator>, | |
689 | is_device_iterator<OutputIterator>, | |
690 | can_copy_with_copy_buffer< | |
691 | InputIterator, OutputIterator | |
692 | > | |
693 | > | |
694 | >::type* = 0) | |
695 | { | |
696 | typedef typename std::iterator_traits<InputIterator>::value_type value_type; | |
697 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
698 | ||
699 | difference_type n = std::distance(first, last); | |
700 | if(n < 1){ | |
701 | // nothing to copy | |
702 | return result; | |
703 | } | |
704 | ||
705 | queue.enqueue_copy_buffer(first.get_buffer(), | |
706 | result.get_buffer(), | |
707 | first.get_index() * sizeof(value_type), | |
708 | result.get_index() * sizeof(value_type), | |
92f5a8d4 TL |
709 | static_cast<size_t>(n) * sizeof(value_type), |
710 | events); | |
7c673cae FG |
711 | return result + n; |
712 | } | |
713 | ||
// device -> device (async)
//
// Generic overload: selected when both iterators are device iterators but
// can_copy_with_copy_buffer<> is false for the pair. The call is forwarded
// unchanged to copy_on_device_async().
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                can_copy_with_copy_buffer<
                                    InputIterator, OutputIterator
                                >
                            >
                        >
                    >::type* = 0)
{
    return copy_on_device_async(first, last, result, queue, events);
}
736 | ||
737 | // device -> device (async, specialization for buffer iterators) | |
738 | template<class InputIterator, class OutputIterator> | |
739 | inline future<OutputIterator> | |
740 | dispatch_copy_async(InputIterator first, | |
741 | InputIterator last, | |
742 | OutputIterator result, | |
743 | command_queue &queue, | |
92f5a8d4 | 744 | const wait_list &events, |
7c673cae FG |
745 | typename boost::enable_if< |
746 | mpl::and_< | |
747 | is_device_iterator<InputIterator>, | |
748 | is_device_iterator<OutputIterator>, | |
749 | can_copy_with_copy_buffer< | |
750 | InputIterator, OutputIterator | |
751 | > | |
752 | > | |
753 | >::type* = 0) | |
754 | { | |
755 | typedef typename std::iterator_traits<InputIterator>::value_type value_type; | |
756 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
757 | ||
758 | difference_type n = std::distance(first, last); | |
759 | if(n < 1){ | |
760 | // nothing to copy | |
761 | return make_future(result, event()); | |
762 | } | |
763 | ||
764 | event event_ = | |
765 | queue.enqueue_copy_buffer( | |
766 | first.get_buffer(), | |
767 | result.get_buffer(), | |
768 | first.get_index() * sizeof(value_type), | |
769 | result.get_index() * sizeof(value_type), | |
92f5a8d4 TL |
770 | static_cast<size_t>(n) * sizeof(value_type), |
771 | events | |
7c673cae FG |
772 | ); |
773 | ||
774 | return make_future(result + n, event_); | |
775 | } | |
776 | ||
// host -> host
//
// Selected when neither iterator is a device iterator. No OpenCL work is
// involved: queue and events are deliberately unused and the range is copied
// with std::copy().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if_c<
                  !is_device_iterator<InputIterator>::value &&
                  !is_device_iterator<OutputIterator>::value
              >::type* = 0)
{
    // silence unused-parameter warnings
    (void) queue;
    (void) events;

    return std::copy(first, last, result);
}
795 | ||
796 | } // end detail namespace | |
797 | ||
/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result.
///
/// The generic copy() function can be used for a variety of data
/// transfer tasks and provides a standard interface to the following
/// OpenCL functions:
///
/// \li \c clEnqueueReadBuffer()
/// \li \c clEnqueueWriteBuffer()
/// \li \c clEnqueueCopyBuffer()
///
/// Unlike the aforementioned OpenCL functions, copy() will also work
/// with non-contiguous data-structures (e.g. \c std::list<T>) as
/// well as with "fancy" iterators (e.g. transform_iterator).
///
/// \param first first element in the range to copy
/// \param last one past the last element in the range to copy
/// \param result first element in the result range
/// \param queue command queue to perform the operation
/// \param events wait list forwarded to the selected transfer routine
///        (presumably the events the transfer waits for before it starts);
///        it is ignored when both ranges are on the host
///
/// \return \c OutputIterator to the end of the result range
///
/// For example, to copy an array of \c int values on the host to a vector on
/// the device:
/// \code
/// // array on the host
/// int data[] = { 1, 2, 3, 4 };
///
/// // vector on the device
/// boost::compute::vector<int> vec(4, context);
///
/// // copy values to the device vector
/// boost::compute::copy(data, data + 4, vec.begin(), queue);
/// \endcode
///
/// The copy algorithm can also be used with standard containers such as
/// \c std::vector<T>:
/// \code
/// std::vector<int> host_vector = ...
/// boost::compute::vector<int> device_vector = ...
///
/// // copy from the host to the device
/// boost::compute::copy(
///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue
/// );
///
/// // copy from the device to the host
/// boost::compute::copy(
///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue
/// );
/// \endcode
///
/// Space complexity: \Omega(1)
///
/// \see copy_n(), copy_if(), copy_async()
template<class InputIterator, class OutputIterator>
inline OutputIterator copy(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           command_queue &queue = system::default_queue(),
                           const wait_list &events = wait_list())
{
    // the detail::dispatch_copy() overload set selects the transfer strategy
    // from the iterator categories and value types
    return detail::dispatch_copy(first, last, result, queue, events);
}
862 | ||
/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result. The copy is performed asynchronously.
///
/// Host iterators must be contiguous: the host <-> device overloads of
/// detail::dispatch_copy_async() reject other host iterators with a static
/// assertion.
///
/// \param first first element in the range to copy
/// \param last one past the last element in the range to copy
/// \param result first element in the result range
/// \param queue command queue to perform the operation
/// \param events wait list forwarded to the selected transfer routine
///        (presumably the events the transfer waits for before it starts)
///
/// \return a future holding the \c OutputIterator to the end of the result
///         range
///
/// \see copy()
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
copy_async(InputIterator first,
           InputIterator last,
           OutputIterator result,
           command_queue &queue = system::default_queue(),
           const wait_list &events = wait_list())
{
    return detail::dispatch_copy_async(first, last, result, queue, events);
}
877 | ||
878 | } // end compute namespace | |
879 | } // end boost namespace | |
880 | ||
881 | #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP |