1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
11 #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
12 #define BOOST_COMPUTE_ALGORITHM_COPY_HPP
17 #include <boost/utility/enable_if.hpp>
19 #include <boost/mpl/and.hpp>
20 #include <boost/mpl/not.hpp>
21 #include <boost/mpl/or.hpp>
23 #include <boost/compute/buffer.hpp>
24 #include <boost/compute/system.hpp>
25 #include <boost/compute/command_queue.hpp>
26 #include <boost/compute/algorithm/detail/copy_on_device.hpp>
27 #include <boost/compute/algorithm/detail/copy_to_device.hpp>
28 #include <boost/compute/algorithm/detail/copy_to_host.hpp>
29 #include <boost/compute/async/future.hpp>
30 #include <boost/compute/container/mapped_view.hpp>
31 #include <boost/compute/detail/device_ptr.hpp>
32 #include <boost/compute/detail/is_contiguous_iterator.hpp>
33 #include <boost/compute/detail/iterator_range_size.hpp>
34 #include <boost/compute/detail/parameter_cache.hpp>
35 #include <boost/compute/iterator/buffer_iterator.hpp>
36 #include <boost/compute/type_traits/type_name.hpp>
37 #include <boost/compute/type_traits/is_device_iterator.hpp>
43 namespace mpl = boost::mpl;
45 // meta-function returning true if copy() between InputIterator and
46 // OutputIterator can be implemented with clEnqueueCopyBuffer().
//
// The visible conditions reference buffer_iterator<> and detail::device_ptr<>
// instantiated with each iterator's value_type, followed by both value_types.
// NOTE(review): the combinators joining these conditions are not visible in
// this extract -- presumably each iterator must be one of those two
// buffer-backed types and both value_types must match; confirm against the
// full header.
47 template<class InputIterator, class OutputIterator>
48 struct can_copy_with_copy_buffer :
// candidate types for InputIterator
53 buffer_iterator<typename InputIterator::value_type>
57 detail::device_ptr<typename InputIterator::value_type>
// candidate types for OutputIterator
63 buffer_iterator<typename OutputIterator::value_type>
67 detail::device_ptr<typename OutputIterator::value_type>
// the two value_types taking part in the condition
71 typename InputIterator::value_type,
72 typename OutputIterator::value_type
76 // meta-function returning true if value_types of HostIterator and
77 // DeviceIterator are same
//
// Both value_types are passed through boost::remove_cv<> before they are
// used. The host value_type is looked up through std::iterator_traits
// (so the host iterator may be a raw pointer), whereas DeviceIterator must
// expose a nested ::value_type.
78 template<class HostIterator, class DeviceIterator>
79 struct is_same_value_type :
// host side
81 typename boost::remove_cv<
82 typename std::iterator_traits<HostIterator>::value_type
// device side
84 typename boost::remove_cv<
85 typename DeviceIterator::value_type
89 // meta-function returning true if value_type of HostIterator is bool
//
// The value_type is obtained via std::iterator_traits and stripped of
// cv-qualifiers with boost::remove_cv<> before it is used.
90 template<class HostIterator>
91 struct is_bool_value_type :
93 typename boost::remove_cv<
94 typename std::iterator_traits<HostIterator>::value_type
99 // host -> device (async)
//
// Overload whose enable_if condition involves is_device_iterator<> for both
// iterators and is_same_value_type<InputIterator, OutputIterator>.
// NOTE(review): the mpl combinators are not visible in this extract;
// presumably this is chosen when only OutputIterator is a device iterator
// and the value_types match -- confirm.
//
// Forwards all arguments to copy_to_device_async() and returns its result.
100 template<class InputIterator, class OutputIterator>
101 inline future<OutputIterator>
102 dispatch_copy_async(InputIterator first,
104 OutputIterator result,
105 command_queue &queue,
106 const wait_list &events,
107 typename boost::enable_if<
110 is_device_iterator<InputIterator>
112 is_device_iterator<OutputIterator>,
113 is_same_value_type<InputIterator, OutputIterator>
// compile-time guard: the host iterator must be contiguous
117 BOOST_STATIC_ASSERT_MSG(
118 is_contiguous_iterator<InputIterator>::value,
119 "copy_async() is only supported for contiguous host iterators"
122 return copy_to_device_async(first, last, result, queue, events);
125 // host -> device (async)
126 // Type mismatch between InputIterator and OutputIterator value_types
//
// Wraps the host range in a mapped_view<input_type> and lets
// copy_on_device_async() perform the copy (and, per the comment below, the
// cast). For an empty range a default-constructed future is returned and
// nothing is enqueued.
// NOTE(review): the enable_if combinators are not visible in this extract.
127 template<class InputIterator, class OutputIterator>
128 inline future<OutputIterator>
129 dispatch_copy_async(InputIterator first,
131 OutputIterator result,
132 command_queue &queue,
133 const wait_list &events,
134 typename boost::enable_if<
137 is_device_iterator<InputIterator>
139 is_device_iterator<OutputIterator>,
141 is_same_value_type<InputIterator, OutputIterator>
// compile-time guard: the host iterator must be contiguous, because the
// range is addressed below through a pointer to its first element
146 BOOST_STATIC_ASSERT_MSG(
147 is_contiguous_iterator<InputIterator>::value,
148 "copy_async() is only supported for contiguous host iterators"
151 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
153 const context &context = queue.get_context();
154 size_t count = iterator_range_size(first, last);
// empty range: return a default-constructed future
156 if(count < size_t(1)) {
157 return future<OutputIterator>();
160 // map [first; last) to device and run copy kernel
161 // on device for copying & casting
162 ::boost::compute::mapped_view<input_type> mapped_host(
163 // make sure it's a pointer to constant data
164 // to force read only mapping
165 const_cast<const input_type*>(
// address of the first host element (*first is only evaluated after the
// empty-range check above)
166 ::boost::addressof(*first)
171 return copy_on_device_async(
172 mapped_host.begin(), mapped_host.end(), result, queue, events
// host -> device (synchronous), matching value_types
177 // InputIterator is a contiguous iterator
//
// Forwards all arguments to copy_to_device() and returns its result.
// NOTE(review): the enable_if combinators are not visible in this extract;
// the visible operands are is_device_iterator<> for both iterators,
// is_same_value_type<> and is_contiguous_iterator<InputIterator>.
178 template<class InputIterator, class OutputIterator>
179 inline OutputIterator
180 dispatch_copy(InputIterator first,
182 OutputIterator result,
183 command_queue &queue,
184 const wait_list &events,
185 typename boost::enable_if<
188 is_device_iterator<InputIterator>
190 is_device_iterator<OutputIterator>,
191 is_same_value_type<InputIterator, OutputIterator>,
192 is_contiguous_iterator<InputIterator>
196 return copy_to_device(first, last, result, queue, events);
// host -> device (synchronous)
200 // Type mismatch between InputIterator and OutputIterator value_types
201 // InputIterator is a contiguous iterator
//
// Picks one of three strategies from the input size in bytes:
//   [0; map_copy_threshold)                     -> copy_to_device_map()
//   [map_copy_threshold; direct_copy_threshold) -> convert into a temporary
//       std::vector<output_type> on the host, then copy_to_device()
//   [direct_copy_threshold; inf)                -> dispatch_copy_async(),
//       waiting on the returned future with .get()
// Default thresholds depend on whether the queue's device is a GPU.
// NOTE(review): the lines that look the thresholds up in the parameter
// cache are only partly visible here; presumably cached values override
// the defaults -- confirm.
202 template<class InputIterator, class OutputIterator>
203 inline OutputIterator
204 dispatch_copy(InputIterator first,
206 OutputIterator result,
207 command_queue &queue,
208 const wait_list &events,
209 typename boost::enable_if<
212 is_device_iterator<InputIterator>
214 is_device_iterator<OutputIterator>,
216 is_same_value_type<InputIterator, OutputIterator>
218 is_contiguous_iterator<InputIterator>
222 typedef typename OutputIterator::value_type output_type;
223 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
225 const device &device = queue.get_device();
227 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
228 std::string cache_key =
229 std::string("__boost_compute_copy_to_device_")
230 + type_name<input_type>() + "_" + type_name<output_type>();
231 boost::shared_ptr<parameter_cache> parameters =
232 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes
234 uint_ map_copy_threshold;
235 uint_ direct_copy_threshold;
237 // calculate default values of thresholds
238 if (device.type() & device::gpu) {
240 map_copy_threshold = 524288; // 0.5 MB
241 direct_copy_threshold = 52428800; // 50 MB
244 // CPUs and other devices
245 map_copy_threshold = 134217728; // 128 MB
246 direct_copy_threshold = 0; // it's never efficient for CPUs
252 cache_key, "map_copy_threshold", map_copy_threshold
254 direct_copy_threshold =
256 cache_key, "direct_copy_threshold", direct_copy_threshold
259 // select copy method based on thresholds & input_size_bytes
260 size_t count = iterator_range_size(first, last);
261 size_t input_size_bytes = count * sizeof(input_type);
263 // [0; map_copy_threshold) -> copy_to_device_map()
264 if(input_size_bytes < map_copy_threshold) {
265 return copy_to_device_map(first, last, result, queue, events);
267 // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
268 // on host and then perform copy_to_device()
269 else if(input_size_bytes < direct_copy_threshold) {
// element-wise conversion input_type -> output_type happens in this
// vector constructor
270 std::vector<output_type> vector(first, last);
271 return copy_to_device(
272 vector.begin(), vector.end(), result, queue, events
276 // [direct_copy_threshold; inf) -> map [first; last) to device and
277 // run copy kernel on device for copying & casting
278 // At this point we are sure that count >= 1 (first != last).
// NOTE(review): that guarantee follows from input_size_bytes being
// >= map_copy_threshold here, so it only holds while the threshold in
// effect is non-zero -- confirm a cached value of 0 cannot occur.
280 // Perform async copy to device, wait for it to be finished and
281 // return the result.
282 // At this point we are sure that count >= 1 (first != last), so event
283 // returned by dispatch_copy_async() must be valid.
284 return dispatch_copy_async(first, last, result, queue, events).get();
// host -> device (synchronous)
288 // InputIterator is NOT a contiguous iterator
//
// Picks one of two strategies from the input size in bytes:
//   copy_to_device_map() for small inputs, or for every input when
//   direct_copy_threshold <= map_copy_threshold;
//   otherwise convert into a temporary std::vector<output_type> on the
//   host and copy_to_device() from that vector.
// NOTE(review): the lines that look the thresholds up in the parameter
// cache are only partly visible here; presumably cached values override
// the defaults -- confirm.
289 template<class InputIterator, class OutputIterator>
290 inline OutputIterator
291 dispatch_copy(InputIterator first,
293 OutputIterator result,
294 command_queue &queue,
295 const wait_list &events,
296 typename boost::enable_if<
299 is_device_iterator<InputIterator>
301 is_device_iterator<OutputIterator>,
303 is_contiguous_iterator<InputIterator>
308 typedef typename OutputIterator::value_type output_type;
309 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
311 const device &device = queue.get_device();
313 // loading parameters
// same cache key prefix as the contiguous host -> device overload
314 std::string cache_key =
315 std::string("__boost_compute_copy_to_device_")
316 + type_name<input_type>() + "_" + type_name<output_type>();
317 boost::shared_ptr<parameter_cache> parameters =
318 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes
320 uint_ map_copy_threshold;
321 uint_ direct_copy_threshold;
323 // calculate default values of thresholds
324 if (device.type() & device::gpu) {
326 map_copy_threshold = 524288; // 0.5 MB
327 direct_copy_threshold = 52428800; // 50 MB
330 // CPUs and other devices
331 map_copy_threshold = 134217728; // 128 MB
332 direct_copy_threshold = 0; // it's never efficient for CPUs
338 cache_key, "map_copy_threshold", map_copy_threshold
340 direct_copy_threshold =
342 cache_key, "direct_copy_threshold", direct_copy_threshold
345 // select copy method based on thresholds & input_size_bytes
346 size_t input_size = iterator_range_size(first, last);
347 size_t input_size_bytes = input_size * sizeof(input_type);
349 // [0; map_copy_threshold) -> copy_to_device_map()
351 // if direct_copy_threshold is less than or equal to map_copy_threshold
352 // copy_to_device_map() is used for every input
353 if(input_size_bytes < map_copy_threshold
354 || direct_copy_threshold <= map_copy_threshold) {
355 return copy_to_device_map(first, last, result, queue, events);
357 // [map_copy_threshold; inf) -> convert [first; last)
358 // on host and then perform copy_to_device()
// the temporary vector provides the contiguous storage that
// [first; last) lacks and converts to output_type while it is filled
359 std::vector<output_type> vector(first, last);
360 return copy_to_device(vector.begin(), vector.end(), result, queue, events);
363 // device -> host (async)
//
// Forwards all arguments to copy_to_host_async() and returns its result.
// Note that is_same_value_type<> takes the host iterator first, hence the
// swapped (OutputIterator, InputIterator) argument order below.
// NOTE(review): the enable_if combinators are not visible in this extract;
// presumably this is chosen when only InputIterator is a device iterator
// and the value_types match -- confirm.
364 template<class InputIterator, class OutputIterator>
365 inline future<OutputIterator>
366 dispatch_copy_async(InputIterator first,
368 OutputIterator result,
369 command_queue &queue,
370 const wait_list &events,
371 typename boost::enable_if<
373 is_device_iterator<InputIterator>,
375 is_device_iterator<OutputIterator>
377 is_same_value_type<OutputIterator, InputIterator>
// compile-time guard: the host (output) iterator must be contiguous
381 BOOST_STATIC_ASSERT_MSG(
382 is_contiguous_iterator<OutputIterator>::value,
383 "copy_async() is only supported for contiguous host iterators"
386 return copy_to_host_async(first, last, result, queue, events);
389 // device -> host (async)
390 // Type mismatch between InputIterator and OutputIterator value_types
//
// Visible steps:
//   1. create a buffer over the host destination (write_only | use_host_ptr),
//   2. run copy_on_device_async() into that buffer,
//   3. enqueue an asynchronous map followed by an unmap of the buffer,
//   4. return a future for (result + count) tied to the unmap event.
// For an empty range a default-constructed future is returned and nothing
// is enqueued.
// NOTE(review): several argument lines of the calls below are not visible
// in this extract.
391 template<class InputIterator, class OutputIterator>
392 inline future<OutputIterator>
393 dispatch_copy_async(InputIterator first,
395 OutputIterator result,
396 command_queue &queue,
397 const wait_list &events,
398 typename boost::enable_if<
400 is_device_iterator<InputIterator>,
402 is_device_iterator<OutputIterator>
405 is_same_value_type<OutputIterator, InputIterator>
// compile-time guard: the host (output) iterator must be contiguous,
// because its storage is handed to the buffer by address below
410 BOOST_STATIC_ASSERT_MSG(
411 is_contiguous_iterator<OutputIterator>::value,
412 "copy_async() is only supported for contiguous host iterators"
415 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
416 const context &context = queue.get_context();
417 size_t count = iterator_range_size(first, last);
// empty range: return a default-constructed future
419 if(count < size_t(1)) {
420 return future<OutputIterator>();
423 // map host memory to device
// buffer size in bytes, measured in host (output) elements
426 count * sizeof(output_type),
427 buffer::write_only | buffer::use_host_ptr,
// address of the first destination element on the host
429 ::boost::addressof(*result)
432 // copy async on device
433 ::boost::compute::future<buffer_iterator<output_type> > future =
434 copy_on_device_async(
437 make_buffer_iterator<output_type>(mapped_host),
441 // update host memory asynchronously by mapping and unmapping memory
443 void* ptr = queue.enqueue_map_buffer_async(
447 count * sizeof(output_type),
// the unmap is enqueued with map_event as its dependency
452 queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
453 return make_future(result + count, unmap_event);
// device -> host (synchronous), matching value_types
457 // OutputIterator is a contiguous iterator
//
// Forwards all arguments to copy_to_host() and returns its result.
// NOTE(review): the enable_if combinators are not visible in this extract;
// the visible operands also include is_bool_value_type<OutputIterator>,
// presumably negated so that bool destinations use another overload --
// confirm.
458 template<class InputIterator, class OutputIterator>
459 inline OutputIterator
460 dispatch_copy(InputIterator first,
462 OutputIterator result,
463 command_queue &queue,
464 const wait_list &events,
465 typename boost::enable_if<
467 is_device_iterator<InputIterator>,
469 is_device_iterator<OutputIterator>
471 is_same_value_type<OutputIterator, InputIterator>,
472 is_contiguous_iterator<OutputIterator>,
474 is_bool_value_type<OutputIterator>
479 return copy_to_host(first, last, result, queue, events);
// device -> host (synchronous)
483 // Type mismatch between InputIterator and OutputIterator value_types
484 // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
485 // is a boolean type.
//
// Picks one of two strategies from the input size in bytes:
//   copy_to_host_map() for small inputs, or for every input when
//   direct_copy_threshold <= map_copy_threshold;
//   otherwise copy_to_host() into a temporary std::vector<input_type> and
//   std::copy() from it into result.
// With the defaults below direct_copy_threshold is 0 on every device, so
// copy_to_host_map() is always taken unless the values in effect differ.
// NOTE(review): the lines that look the thresholds up in the parameter
// cache are only partly visible here; presumably cached values override
// the defaults -- confirm.
486 template<class InputIterator, class OutputIterator>
487 inline OutputIterator
488 dispatch_copy(InputIterator first,
490 OutputIterator result,
491 command_queue &queue,
492 const wait_list &events,
493 typename boost::enable_if<
495 is_device_iterator<InputIterator>,
497 is_device_iterator<OutputIterator>
501 is_contiguous_iterator<OutputIterator>
503 is_bool_value_type<OutputIterator>
508 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
509 typedef typename InputIterator::value_type input_type;
511 const device &device = queue.get_device();
513 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
514 std::string cache_key =
515 std::string("__boost_compute_copy_to_host_")
516 + type_name<input_type>() + "_" + type_name<output_type>();
517 boost::shared_ptr<parameter_cache> parameters =
518 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes
520 uint_ map_copy_threshold;
521 uint_ direct_copy_threshold;
523 // calculate default values of thresholds
524 if (device.type() & device::gpu) {
526 map_copy_threshold = 33554432; // 32 MiB
527 direct_copy_threshold = 0; // it's never efficient for GPUs
530 // CPUs and other devices
531 map_copy_threshold = 134217728; // 128 MB
532 direct_copy_threshold = 0; // it's never efficient for CPUs
538 cache_key, "map_copy_threshold", map_copy_threshold
540 direct_copy_threshold =
542 cache_key, "direct_copy_threshold", direct_copy_threshold
545 // select copy method based on thresholds & input_size_bytes
546 size_t count = iterator_range_size(first, last);
547 size_t input_size_bytes = count * sizeof(input_type);
549 // [0; map_copy_threshold) -> copy_to_host_map()
551 // if direct_copy_threshold is less than or equal to map_copy_threshold
552 // copy_to_host_map() is used for every input
553 if(input_size_bytes < map_copy_threshold
554 || direct_copy_threshold <= map_copy_threshold) {
555 return copy_to_host_map(first, last, result, queue, events);
557 // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
558 // then copy (and convert) to result using std::copy()
// the temporary holds device-side input_type values; the conversion to
// output_type happens in std::copy()
559 std::vector<input_type> vector(count);
560 copy_to_host(first, last, vector.begin(), queue, events);
561 return std::copy(vector.begin(), vector.end(), result);
// device -> host (synchronous)
565 // Type mismatch between InputIterator and OutputIterator value_types
566 // OutputIterator is a contiguous iterator
567 // value_type of OutputIterator is NOT a boolean type
//
// Picks one of three strategies from the input size in bytes:
//   [0; map_copy_threshold)                     -> copy_to_host_map()
//   [map_copy_threshold; direct_copy_threshold) -> copy_to_host() into a
//       temporary std::vector<input_type>, then std::copy() into result
//   [direct_copy_threshold; inf)                -> dispatch_copy_async(),
//       waiting on the returned future with .get()
// NOTE(review): the lines that look the thresholds up in the parameter
// cache are only partly visible here; presumably cached values override
// the defaults -- confirm.
568 template<class InputIterator, class OutputIterator>
569 inline OutputIterator
570 dispatch_copy(InputIterator first,
572 OutputIterator result,
573 command_queue &queue,
574 const wait_list &events,
575 typename boost::enable_if<
577 is_device_iterator<InputIterator>,
579 is_device_iterator<OutputIterator>
582 is_same_value_type<OutputIterator, InputIterator>
584 is_contiguous_iterator<OutputIterator>,
586 is_bool_value_type<OutputIterator>
591 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
592 typedef typename InputIterator::value_type input_type;
594 const device &device = queue.get_device();
596 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
597 std::string cache_key =
598 std::string("__boost_compute_copy_to_host_")
599 + type_name<input_type>() + "_" + type_name<output_type>();
600 boost::shared_ptr<parameter_cache> parameters =
601 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes
603 uint_ map_copy_threshold;
604 uint_ direct_copy_threshold;
606 // calculate default values of thresholds
607 if (device.type() & device::gpu) {
609 map_copy_threshold = 524288; // 0.5 MB
610 direct_copy_threshold = 52428800; // 50 MB
613 // CPUs and other devices
614 map_copy_threshold = 134217728; // 128 MB
615 direct_copy_threshold = 0; // it's never efficient for CPUs
621 cache_key, "map_copy_threshold", map_copy_threshold
623 direct_copy_threshold =
625 cache_key, "direct_copy_threshold", direct_copy_threshold
628 // select copy method based on thresholds & input_size_bytes
629 size_t count = iterator_range_size(first, last);
630 size_t input_size_bytes = count * sizeof(input_type);
632 // [0; map_copy_threshold) -> copy_to_host_map()
633 if(input_size_bytes < map_copy_threshold) {
634 return copy_to_host_map(first, last, result, queue, events);
636 // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
637 // temporary vector then copy (and convert) to result using std::copy()
638 else if(input_size_bytes < direct_copy_threshold) {
// the temporary holds device-side input_type values; the conversion to
// output_type happens in std::copy()
639 std::vector<input_type> vector(count);
640 copy_to_host(first, last, vector.begin(), queue, events);
641 return std::copy(vector.begin(), vector.end(), result);
644 // [direct_copy_threshold; inf) -> map [result; result + input_size) to
645 // device and run copy kernel on device for copying & casting
646 // map host memory to device.
648 // Perform async copy to host, wait for it to be finished and
649 // return the result.
650 // At this point we are sure that count >= 1 (first != last), so event
651 // returned by dispatch_copy_async() must be valid.
// NOTE(review): that guarantee follows from input_size_bytes being
// >= map_copy_threshold here, so it only holds while the threshold in
// effect is non-zero -- confirm a cached value of 0 cannot occur.
652 return dispatch_copy_async(first, last, result, queue, events).get();
// device -> device (synchronous)
//
// Forwards all arguments to copy_on_device() and returns its result.
// NOTE(review): can_copy_with_copy_buffer<> also appears in the overload
// marked "specialization for buffer iterators"; presumably it is negated
// here (the combinators are not visible in this extract) -- confirm.
656 template<class InputIterator, class OutputIterator>
657 inline OutputIterator
658 dispatch_copy(InputIterator first,
660 OutputIterator result,
661 command_queue &queue,
662 const wait_list &events,
663 typename boost::enable_if<
665 is_device_iterator<InputIterator>,
666 is_device_iterator<OutputIterator>,
668 can_copy_with_copy_buffer<
669 InputIterator, OutputIterator
675 return copy_on_device(first, last, result, queue, events);
678 // device -> device (specialization for buffer iterators)
//
// Copies with queue.enqueue_copy_buffer() instead of a copy kernel. Byte
// offsets and the byte count are derived from the iterators' get_index()
// and the element count, each scaled by sizeof(value_type).
// NOTE(review): the empty-range handling and the return statement are not
// visible in this extract.
679 template<class InputIterator, class OutputIterator>
680 inline OutputIterator
681 dispatch_copy(InputIterator first,
683 OutputIterator result,
684 command_queue &queue,
685 const wait_list &events,
686 typename boost::enable_if<
688 is_device_iterator<InputIterator>,
689 is_device_iterator<OutputIterator>,
690 can_copy_with_copy_buffer<
691 InputIterator, OutputIterator
// a single value_type is used for source and destination sizes
696 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
697 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
// number of elements to copy
699 difference_type n = std::distance(first, last);
705 queue.enqueue_copy_buffer(first.get_buffer(),
// source offset, destination offset and size, all in bytes
707 first.get_index() * sizeof(value_type),
708 result.get_index() * sizeof(value_type),
709 static_cast<size_t>(n) * sizeof(value_type),
714 // device -> device (async)
//
// Forwards all arguments to copy_on_device_async() and returns its future.
// NOTE(review): can_copy_with_copy_buffer<> is presumably negated in this
// condition (the combinators are not visible in this extract) -- confirm.
715 template<class InputIterator, class OutputIterator>
716 inline future<OutputIterator>
717 dispatch_copy_async(InputIterator first,
719 OutputIterator result,
720 command_queue &queue,
721 const wait_list &events,
722 typename boost::enable_if<
724 is_device_iterator<InputIterator>,
725 is_device_iterator<OutputIterator>,
727 can_copy_with_copy_buffer<
728 InputIterator, OutputIterator
734 return copy_on_device_async(first, last, result, queue, events);
737 // device -> device (async, specialization for buffer iterators)
//
// Copies with queue.enqueue_copy_buffer() and returns a future for
// (result + n). A second return path yields make_future(result, event())
// without advancing result.
// NOTE(review): the condition guarding that early return is not visible in
// this extract; presumably it is the empty-range case -- confirm.
738 template<class InputIterator, class OutputIterator>
739 inline future<OutputIterator>
740 dispatch_copy_async(InputIterator first,
742 OutputIterator result,
743 command_queue &queue,
744 const wait_list &events,
745 typename boost::enable_if<
747 is_device_iterator<InputIterator>,
748 is_device_iterator<OutputIterator>,
749 can_copy_with_copy_buffer<
750 InputIterator, OutputIterator
// a single value_type is used for source and destination sizes
755 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
756 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
// number of elements to copy
758 difference_type n = std::distance(first, last);
// early return: result is not advanced and a default-constructed event
// is attached
761 return make_future(result, event());
765 queue.enqueue_copy_buffer(
// source offset, destination offset and size, all in bytes
768 first.get_index() * sizeof(value_type),
769 result.get_index() * sizeof(value_type),
770 static_cast<size_t>(n) * sizeof(value_type),
774 return make_future(result + n, event_);
// host -> host
//
// Enabled when neither iterator is a device iterator. Delegates to
// std::copy(); the queue and events arguments are not used.
778 template<class InputIterator, class OutputIterator>
779 inline OutputIterator
780 dispatch_copy(InputIterator first,
782 OutputIterator result,
783 command_queue &queue,
784 const wait_list &events,
785 typename boost::enable_if_c<
786 !is_device_iterator<InputIterator>::value &&
787 !is_device_iterator<OutputIterator>::value
793 return std::copy(first, last, result);
796 } // end detail namespace
798 /// Copies the values in the range [\p first, \p last) to the range
799 /// beginning at \p result.
801 /// The generic copy() function can be used for a variety of data
802 /// transfer tasks and provides a standard interface to the following
803 /// OpenCL functions:
805 /// \li \c clEnqueueReadBuffer()
806 /// \li \c clEnqueueWriteBuffer()
807 /// \li \c clEnqueueCopyBuffer()
809 /// Unlike the aforementioned OpenCL functions, copy() will also work
810 /// with non-contiguous data-structures (e.g. \c std::list<T>) as
811 /// well as with "fancy" iterators (e.g. transform_iterator).
813 /// \param first first element in the range to copy
814 /// \param last end of the range to copy (one past the last element)
815 /// \param result first element in the result range
816 /// \param queue command queue to perform the operation
/// \param events wait list forwarded unchanged to the selected
///        detail::dispatch_copy() overload
818 /// \return \c OutputIterator to the end of the result range
820 /// For example, to copy an array of \c int values on the host to a vector on
/// the device:
823 /// // array on the host
824 /// int data[] = { 1, 2, 3, 4 };
826 /// // vector on the device
827 /// boost::compute::vector<int> vec(4, context);
829 /// // copy values to the device vector
830 /// boost::compute::copy(data, data + 4, vec.begin(), queue);
833 /// The copy algorithm can also be used with standard containers such as
834 /// \c std::vector<T>:
836 /// std::vector<int> host_vector = ...
837 /// boost::compute::vector<int> device_vector = ...
839 /// // copy from the host to the device
840 /// boost::compute::copy(
841 /// host_vector.begin(), host_vector.end(), device_vector.begin(), queue
844 /// // copy from the device to the host
845 /// boost::compute::copy(
846 /// device_vector.begin(), device_vector.end(), host_vector.begin(), queue
850 /// Space complexity: \Omega(1)
852 /// \see copy_n(), copy_if(), copy_async()
853 template<class InputIterator, class OutputIterator>
854 inline OutputIterator copy(InputIterator first,
856 OutputIterator result,
857 command_queue &queue = system::default_queue(),
858 const wait_list &events = wait_list())
// overload resolution inside detail:: selects the transfer strategy from
// the iterator types
860 return detail::dispatch_copy(first, last, result, queue, events);
863 /// Copies the values in the range [\p first, \p last) to the range
864 /// beginning at \p result. The copy is performed asynchronously.
///
/// Host iterators must be contiguous: the host <-> device overloads of
/// detail::dispatch_copy_async() enforce this with a static assertion.
///
/// \param first first element in the range to copy
/// \param last end of the range to copy (one past the last element)
/// \param result first element in the result range
/// \param queue command queue to perform the operation
/// \param events wait list forwarded unchanged to the selected
///        detail::dispatch_copy_async() overload
///
/// \return the \c future<OutputIterator> produced by the selected
///         detail::dispatch_copy_async() overload
///
/// \see copy()
867 template<class InputIterator, class OutputIterator>
868 inline future<OutputIterator>
869 copy_async(InputIterator first,
871 OutputIterator result,
872 command_queue &queue = system::default_queue(),
873 const wait_list &events = wait_list())
875 return detail::dispatch_copy_async(first, last, result, queue, events);
878 } // end compute namespace
879 } // end boost namespace
881 #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP