1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
11 #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
12 #define BOOST_COMPUTE_ALGORITHM_COPY_HPP
17 #include <boost/utility/enable_if.hpp>
19 #include <boost/mpl/and.hpp>
20 #include <boost/mpl/not.hpp>
21 #include <boost/mpl/or.hpp>
23 #include <boost/compute/buffer.hpp>
24 #include <boost/compute/system.hpp>
25 #include <boost/compute/command_queue.hpp>
26 #include <boost/compute/algorithm/detail/copy_on_device.hpp>
27 #include <boost/compute/algorithm/detail/copy_to_device.hpp>
28 #include <boost/compute/algorithm/detail/copy_to_host.hpp>
29 #include <boost/compute/async/future.hpp>
30 #include <boost/compute/container/mapped_view.hpp>
31 #include <boost/compute/detail/device_ptr.hpp>
32 #include <boost/compute/detail/is_contiguous_iterator.hpp>
33 #include <boost/compute/detail/iterator_range_size.hpp>
34 #include <boost/compute/detail/parameter_cache.hpp>
35 #include <boost/compute/iterator/buffer_iterator.hpp>
36 #include <boost/compute/type_traits/type_name.hpp>
37 #include <boost/compute/type_traits/is_device_iterator.hpp>
43 namespace mpl = boost::mpl;
45 // meta-function returning true if copy() between InputIterator and
46 // OutputIterator can be implemented with clEnqueueCopyBuffer().
//
// The operands visible here are: buffer_iterator<value_type> and
// detail::device_ptr<value_type> for InputIterator, the same pair for
// OutputIterator, and the two iterators' value_types.
// NOTE(review): the lines that combine these operands (mpl::and_/or_ and the
// type comparisons) are not visible in this excerpt -- presumably the trait
// is true when each iterator is one of those two raw device iterator types
// and both value_types are identical; confirm against the full header.
47 template<class InputIterator, class OutputIterator>
48 struct can_copy_with_copy_buffer :
53 buffer_iterator<typename InputIterator::value_type>
57 detail::device_ptr<typename InputIterator::value_type>
63 buffer_iterator<typename OutputIterator::value_type>
67 detail::device_ptr<typename OutputIterator::value_type>
71 typename InputIterator::value_type,
72 typename OutputIterator::value_type
76 // meta-function returning true if value_types of HostIterator and
77 // DeviceIterator are same
//
// Both value_types are passed through boost::remove_cv first, so cv
// qualification is stripped from each side before they are compared.
// HostIterator's value_type is read through std::iterator_traits, while
// DeviceIterator's is read from its nested value_type typedef.
// NOTE(review): the comparing base class itself is not visible in this
// excerpt (presumably boost::is_same) -- confirm against the full header.
78 template<class HostIterator, class DeviceIterator>
79 struct is_same_value_type :
81 typename boost::remove_cv<
82 typename std::iterator_traits<HostIterator>::value_type
84 typename boost::remove_cv<
85 typename DeviceIterator::value_type
89 // meta-function returning true if value_type of HostIterator is bool
//
// The value_type is obtained via std::iterator_traits and stripped of cv
// qualifiers with boost::remove_cv before the check.
// NOTE(review): the base class and the `bool` operand are not visible in this
// excerpt; the description above relies on the original header comment.
90 template<class HostIterator>
91 struct is_bool_value_type :
93 typename boost::remove_cv<
94 typename std::iterator_traits<HostIterator>::value_type
99 // host -> device (async)
//
// Overload selected through the enable_if condition below, which is built
// from is_device_iterator<> on both iterators and is_same_value_type<>.
// NOTE(review): the mpl wrappers (and_/not_) around those traits are not
// visible in this excerpt; per the header comment this is the overload for a
// host input, a device output and matching value_types.
//
// Returns the future produced by copy_to_device_async().
100 template<class InputIterator, class OutputIterator>
101 inline future<OutputIterator>
102 dispatch_copy_async(InputIterator first,
104 OutputIterator result,
105 command_queue &queue,
106 typename boost::enable_if<
109 is_device_iterator<InputIterator>
111 is_device_iterator<OutputIterator>,
112 is_same_value_type<InputIterator, OutputIterator>
// compile-time rejection of non-contiguous host input iterators
116 BOOST_STATIC_ASSERT_MSG(
117 is_contiguous_iterator<InputIterator>::value,
118 "copy_async() is only supported for contiguous host iterators"
121 return copy_to_device_async(first, last, result, queue);
124 // host -> device (async)
125 // Type mismatch between InputIterator and OutputIterator value_types
//
// The host range is wrapped in a mapped_view and the copy (with the value
// conversion) is then delegated to copy_on_device_async().
// An empty input range yields a default-constructed future.
// NOTE(review): the mpl wrappers (and_/not_) of the enable_if condition and
// the remaining mapped_view constructor arguments are not visible in this
// excerpt.
126 template<class InputIterator, class OutputIterator>
127 inline future<OutputIterator>
128 dispatch_copy_async(InputIterator first,
130 OutputIterator result,
131 command_queue &queue,
132 typename boost::enable_if<
135 is_device_iterator<InputIterator>
137 is_device_iterator<OutputIterator>,
139 is_same_value_type<InputIterator, OutputIterator>
// compile-time rejection of non-contiguous host input iterators
144 BOOST_STATIC_ASSERT_MSG(
145 is_contiguous_iterator<InputIterator>::value,
146 "copy_async() is only supported for contiguous host iterators"
149 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
151 const context &context = queue.get_context();
152 size_t count = iterator_range_size(first, last);
// nothing to copy: return a default-constructed future
154 if(count < size_t(1)) {
155 return future<OutputIterator>();
158 // map [first; last) to device and run copy kernel
159 // on device for copying & casting
160 ::boost::compute::mapped_view<input_type> mapped_host(
161 // make sure it's a pointer to constant data
162 // to force read only mapping
163 const_cast<const input_type*>(
164 ::boost::addressof(*first)
169 return copy_on_device_async(
170 mapped_host.begin(), mapped_host.end(), result, queue
// Synchronous overload that forwards directly to copy_to_device().
// The enable_if condition combines is_device_iterator<> on both iterators,
// is_same_value_type<> and is_contiguous_iterator<InputIterator>.
// NOTE(review): the mpl wrappers (and_/not_) are not visible in this excerpt;
// presumably this is the host -> device case with matching value_types.
175 // InputIterator is a contiguous iterator
176 template<class InputIterator, class OutputIterator>
177 inline OutputIterator
178 dispatch_copy(InputIterator first,
180 OutputIterator result,
181 command_queue &queue,
182 typename boost::enable_if<
185 is_device_iterator<InputIterator>
187 is_device_iterator<OutputIterator>,
188 is_same_value_type<InputIterator, OutputIterator>,
189 is_contiguous_iterator<InputIterator>
193 return copy_to_device(first, last, result, queue);
197 // Type mismatch between InputIterator and OutputIterator value_types
198 // InputIterator is a contiguous iterator
//
// Chooses one of three strategies from the input size in bytes and two
// thresholds: copy_to_device_map(), host-side conversion into a temporary
// std::vector<output_type> followed by copy_to_device(), or
// dispatch_copy_async() followed by get() on the returned future.
// NOTE(review): the mpl wrappers of the enable_if condition, the `else`
// line of the default-threshold selection and the parameter_cache lookup
// calls are not visible in this excerpt; only their arguments are.
199 template<class InputIterator, class OutputIterator>
200 inline OutputIterator
201 dispatch_copy(InputIterator first,
203 OutputIterator result,
204 command_queue &queue,
205 typename boost::enable_if<
208 is_device_iterator<InputIterator>
210 is_device_iterator<OutputIterator>,
212 is_same_value_type<InputIterator, OutputIterator>
214 is_contiguous_iterator<InputIterator>
218 typedef typename OutputIterator::value_type output_type;
219 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
221 const device &device = queue.get_device();
223 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
224 std::string cache_key =
225 std::string("__boost_compute_copy_to_device_")
226 + type_name<input_type>() + "_" + type_name<output_type>();
227 boost::shared_ptr<parameter_cache> parameters =
228 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes (compared to input_size_bytes below)
230 uint_ map_copy_threshold;
231 uint_ direct_copy_threshold;
233 // calculate default values of thresholds
234 if (device.type() & device::gpu) {
236 map_copy_threshold = 524288; // 0.5 MB
237 direct_copy_threshold = 52428800; // 50 MB
240 // CPUs and other devices
241 map_copy_threshold = 134217728; // 128 MB
242 direct_copy_threshold = 0; // it's never efficient for CPUs
// NOTE(review): presumably the values stored in the parameter cache, if any,
// replace the defaults computed above -- confirm against the full header
248 cache_key, "map_copy_threshold", map_copy_threshold
250 direct_copy_threshold =
252 cache_key, "direct_copy_threshold", direct_copy_threshold
255 // select copy method based on thresholds & input_size_bytes
256 size_t count = iterator_range_size(first, last);
257 size_t input_size_bytes = count * sizeof(input_type);
259 // [0; map_copy_threshold) -> copy_to_device_map()
260 if(input_size_bytes < map_copy_threshold) {
261 return copy_to_device_map(first, last, result, queue);
263 // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
264 // on host and then perform copy_to_device()
265 else if(input_size_bytes < direct_copy_threshold) {
266 std::vector<output_type> vector(first, last);
267 return copy_to_device(vector.begin(), vector.end(), result, queue);
270 // [direct_copy_threshold; inf) -> map [first; last) to device and
271 // run copy kernel on device for copying & casting
272 // At this point we are sure that count >= 1 (first != last).
274 // Perform async copy to device, wait for it to be finished and
275 // return the result.
276 // At this point we are sure that count >= 1 (first != last), so event
277 // returned by dispatch_copy_async() must be valid.
278 return dispatch_copy_async(first, last, result, queue).get();
282 // InputIterator is NOT a contiguous iterator
//
// Chooses between copy_to_device_map() and a host-side copy into a temporary
// std::vector<output_type> that is then passed to copy_to_device().
// is_same_value_type<> is not part of the visible enable_if condition, so
// this overload does not appear to depend on whether the value_types match.
// NOTE(review): the mpl wrappers of the enable_if condition, the `else`
// line of the default-threshold selection and the parameter_cache lookup
// calls are not visible in this excerpt; only their arguments are.
283 template<class InputIterator, class OutputIterator>
284 inline OutputIterator
285 dispatch_copy(InputIterator first,
287 OutputIterator result,
288 command_queue &queue,
289 typename boost::enable_if<
292 is_device_iterator<InputIterator>
294 is_device_iterator<OutputIterator>,
296 is_contiguous_iterator<InputIterator>
301 typedef typename OutputIterator::value_type output_type;
302 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
304 const device &device = queue.get_device();
306 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
307 std::string cache_key =
308 std::string("__boost_compute_copy_to_device_")
309 + type_name<input_type>() + "_" + type_name<output_type>();
310 boost::shared_ptr<parameter_cache> parameters =
311 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes (compared to input_size_bytes below)
313 uint_ map_copy_threshold;
314 uint_ direct_copy_threshold;
316 // calculate default values of thresholds
317 if (device.type() & device::gpu) {
319 map_copy_threshold = 524288; // 0.5 MB
320 direct_copy_threshold = 52428800; // 50 MB
323 // CPUs and other devices
324 map_copy_threshold = 134217728; // 128 MB
325 direct_copy_threshold = 0; // it's never efficient for CPUs
// NOTE(review): presumably the values stored in the parameter cache, if any,
// replace the defaults computed above -- confirm against the full header
331 cache_key, "map_copy_threshold", map_copy_threshold
333 direct_copy_threshold =
335 cache_key, "direct_copy_threshold", direct_copy_threshold
338 // select copy method based on thresholds & input_size_bytes
339 size_t input_size = iterator_range_size(first, last);
340 size_t input_size_bytes = input_size * sizeof(input_type);
342 // [0; map_copy_threshold) -> copy_to_device_map()
344 // if direct_copy_threshold is less than or equal to map_copy_threshold
345 // copy_to_device_map() is used for every input
346 if(input_size_bytes < map_copy_threshold
347 || direct_copy_threshold <= map_copy_threshold) {
348 return copy_to_device_map(first, last, result, queue);
350 // [map_copy_threshold; inf) -> convert [first; last)
351 // on host and then perform copy_to_device()
352 std::vector<output_type> vector(first, last);
353 return copy_to_device(vector.begin(), vector.end(), result, queue);
356 // device -> host (async)
//
// Overload selected through the enable_if condition below, which is built
// from is_device_iterator<> on both iterators and is_same_value_type<>
// (note the argument order: the host OutputIterator comes first).
// NOTE(review): the mpl wrappers (and_/not_) around those traits are not
// visible in this excerpt; per the header comment this is the overload for a
// device input, a host output and matching value_types.
//
// Returns the future produced by copy_to_host_async().
357 template<class InputIterator, class OutputIterator>
358 inline future<OutputIterator>
359 dispatch_copy_async(InputIterator first,
361 OutputIterator result,
362 command_queue &queue,
363 typename boost::enable_if<
365 is_device_iterator<InputIterator>,
367 is_device_iterator<OutputIterator>
369 is_same_value_type<OutputIterator, InputIterator>
// compile-time rejection of non-contiguous host output iterators
373 BOOST_STATIC_ASSERT_MSG(
374 is_contiguous_iterator<OutputIterator>::value,
375 "copy_async() is only supported for contiguous host iterators"
378 return copy_to_host_async(first, last, result, queue);
381 // device -> host (async)
382 // Type mismatch between InputIterator and OutputIterator value_types
//
// Steps visible below: wrap the host destination in a buffer created with
// write_only | use_host_ptr, run copy_on_device_async() into that buffer,
// then enqueue a map followed by an unmap of the buffer. The returned future
// carries result + count together with the unmap event.
// An empty input range yields a default-constructed future.
// NOTE(review): several lines of this body (the buffer declaration, most
// arguments of copy_on_device_async() and enqueue_map_buffer_async(), and
// the declarations of map_event/unmap_event) are not visible in this
// excerpt -- confirm details against the full header.
383 template<class InputIterator, class OutputIterator>
384 inline future<OutputIterator>
385 dispatch_copy_async(InputIterator first,
387 OutputIterator result,
388 command_queue &queue,
389 typename boost::enable_if<
391 is_device_iterator<InputIterator>,
393 is_device_iterator<OutputIterator>
396 is_same_value_type<OutputIterator, InputIterator>
// compile-time rejection of non-contiguous host output iterators
401 BOOST_STATIC_ASSERT_MSG(
402 is_contiguous_iterator<OutputIterator>::value,
403 "copy_async() is only supported for contiguous host iterators"
406 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
407 const context &context = queue.get_context();
408 size_t count = iterator_range_size(first, last);
// nothing to copy: return a default-constructed future
410 if(count < size_t(1)) {
411 return future<OutputIterator>();
414 // map host memory to device
417 count * sizeof(output_type),
418 buffer::write_only | buffer::use_host_ptr,
420 ::boost::addressof(*result)
423 // copy async on device
424 ::boost::compute::future<buffer_iterator<output_type> > future =
425 copy_on_device_async(
428 make_buffer_iterator<output_type>(mapped_host),
431 // update host memory asynchronously by mapping and unmapping memory
433 void* ptr = queue.enqueue_map_buffer_async(
437 count * sizeof(output_type),
442 queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
443 return make_future(result + count, unmap_event);
// Synchronous overload that forwards directly to copy_to_host().
// The enable_if condition combines is_device_iterator<> on both iterators,
// is_same_value_type<>, is_contiguous_iterator<OutputIterator> and
// is_bool_value_type<OutputIterator>.
// NOTE(review): the mpl wrappers (and_/not_) are not visible in this excerpt;
// presumably this is the device -> host case with matching, non-bool
// value_types -- confirm against the full header.
447 // OutputIterator is a contiguous iterator
448 template<class InputIterator, class OutputIterator>
449 inline OutputIterator
450 dispatch_copy(InputIterator first,
452 OutputIterator result,
453 command_queue &queue,
454 typename boost::enable_if<
456 is_device_iterator<InputIterator>,
458 is_device_iterator<OutputIterator>
460 is_same_value_type<OutputIterator, InputIterator>,
461 is_contiguous_iterator<OutputIterator>,
463 is_bool_value_type<OutputIterator>
468 return copy_to_host(first, last, result, queue);
472 // Type mismatch between InputIterator and OutputIterator value_types
473 // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
474 // is a boolean type.
//
// Chooses between copy_to_host_map() and a two-step copy: copy_to_host()
// into a temporary std::vector<input_type>, then std::copy() into result
// (which performs the conversion to output_type).
// NOTE(review): the mpl wrappers of the enable_if condition, the `else`
// line of the default-threshold selection and the parameter_cache lookup
// calls are not visible in this excerpt; only their arguments are.
475 template<class InputIterator, class OutputIterator>
476 inline OutputIterator
477 dispatch_copy(InputIterator first,
479 OutputIterator result,
480 command_queue &queue,
481 typename boost::enable_if<
483 is_device_iterator<InputIterator>,
485 is_device_iterator<OutputIterator>
489 is_contiguous_iterator<OutputIterator>
491 is_bool_value_type<OutputIterator>
496 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
497 typedef typename InputIterator::value_type input_type;
499 const device &device = queue.get_device();
501 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
502 std::string cache_key =
503 std::string("__boost_compute_copy_to_host_")
504 + type_name<input_type>() + "_" + type_name<output_type>();
505 boost::shared_ptr<parameter_cache> parameters =
506 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes (compared to input_size_bytes below)
508 uint_ map_copy_threshold;
509 uint_ direct_copy_threshold;
511 // calculate default values of thresholds
512 if (device.type() & device::gpu) {
514 map_copy_threshold = 33554432; // 32 MB
515 direct_copy_threshold = 0; // it's never efficient for GPUs
518 // CPUs and other devices
519 map_copy_threshold = 134217728; // 128 MB
520 direct_copy_threshold = 0; // it's never efficient for CPUs
// NOTE(review): presumably the values stored in the parameter cache, if any,
// replace the defaults computed above -- confirm against the full header
526 cache_key, "map_copy_threshold", map_copy_threshold
528 direct_copy_threshold =
530 cache_key, "direct_copy_threshold", direct_copy_threshold
533 // select copy method based on thresholds & input_size_bytes
534 size_t count = iterator_range_size(first, last);
535 size_t input_size_bytes = count * sizeof(input_type);
537 // [0; map_copy_threshold) -> copy_to_host_map()
539 // if direct_copy_threshold is less than or equal to map_copy_threshold
540 // copy_to_host_map() is used for every input
541 if(input_size_bytes < map_copy_threshold
542 || direct_copy_threshold <= map_copy_threshold) {
543 return copy_to_host_map(first, last, result, queue);
545 // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
546 // then copy (and convert) to result using std::copy()
547 std::vector<input_type> vector(count);
548 copy_to_host(first, last, vector.begin(), queue);
549 return std::copy(vector.begin(), vector.end(), result);
553 // Type mismatch between InputIterator and OutputIterator value_types
554 // OutputIterator is a contiguous iterator
555 // value_type of OutputIterator is NOT a boolean type
//
// Chooses one of three strategies from the input size in bytes and two
// thresholds: copy_to_host_map(), copy_to_host() into a temporary
// std::vector<input_type> followed by std::copy() into result, or
// dispatch_copy_async() followed by get() on the returned future.
// NOTE(review): the mpl wrappers of the enable_if condition, the `else`
// line of the default-threshold selection and the parameter_cache lookup
// calls are not visible in this excerpt; only their arguments are.
556 template<class InputIterator, class OutputIterator>
557 inline OutputIterator
558 dispatch_copy(InputIterator first,
560 OutputIterator result,
561 command_queue &queue,
562 typename boost::enable_if<
564 is_device_iterator<InputIterator>,
566 is_device_iterator<OutputIterator>
569 is_same_value_type<OutputIterator, InputIterator>
571 is_contiguous_iterator<OutputIterator>,
573 is_bool_value_type<OutputIterator>
578 typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
579 typedef typename InputIterator::value_type input_type;
581 const device &device = queue.get_device();
583 // loading parameters
// the cache key is specific to the (input_type, output_type) pair
584 std::string cache_key =
585 std::string("__boost_compute_copy_to_host_")
586 + type_name<input_type>() + "_" + type_name<output_type>();
587 boost::shared_ptr<parameter_cache> parameters =
588 detail::parameter_cache::get_global_cache(device);
// both thresholds are expressed in bytes (compared to input_size_bytes below)
590 uint_ map_copy_threshold;
591 uint_ direct_copy_threshold;
593 // calculate default values of thresholds
594 if (device.type() & device::gpu) {
596 map_copy_threshold = 524288; // 0.5 MB
597 direct_copy_threshold = 52428800; // 50 MB
600 // CPUs and other devices
601 map_copy_threshold = 134217728; // 128 MB
602 direct_copy_threshold = 0; // it's never efficient for CPUs
// NOTE(review): presumably the values stored in the parameter cache, if any,
// replace the defaults computed above -- confirm against the full header
608 cache_key, "map_copy_threshold", map_copy_threshold
610 direct_copy_threshold =
612 cache_key, "direct_copy_threshold", direct_copy_threshold
615 // select copy method based on thresholds & input_size_bytes
616 size_t count = iterator_range_size(first, last);
617 size_t input_size_bytes = count * sizeof(input_type);
619 // [0; map_copy_threshold) -> copy_to_host_map()
620 if(input_size_bytes < map_copy_threshold) {
621 return copy_to_host_map(first, last, result, queue);
623 // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
624 // temporary vector then copy (and convert) to result using std::copy()
625 else if(input_size_bytes < direct_copy_threshold) {
626 std::vector<input_type> vector(count);
627 copy_to_host(first, last, vector.begin(), queue);
628 return std::copy(vector.begin(), vector.end(), result);
631 // [direct_copy_threshold; inf) -> map [result; result + input_size) to
632 // device and run copy kernel on device for copying & casting
633 // map host memory to device.
635 // Perform async copy to host, wait for it to be finished and
636 // return the result.
637 // At this point we are sure that count >= 1 (first != last), so event
638 // returned by dispatch_copy_async() must be valid.
639 return dispatch_copy_async(first, last, result, queue).get();
// device -> device: both iterators are required to be device iterators and
// the copy is forwarded to copy_on_device().
// NOTE(review): the mpl wrappers of the enable_if condition are not visible
// in this excerpt; presumably can_copy_with_copy_buffer<> is negated here,
// since a sibling overload handles the buffer-iterator case -- confirm.
643 template<class InputIterator, class OutputIterator>
644 inline OutputIterator
645 dispatch_copy(InputIterator first,
647 OutputIterator result,
648 command_queue &queue,
649 typename boost::enable_if<
651 is_device_iterator<InputIterator>,
652 is_device_iterator<OutputIterator>,
654 can_copy_with_copy_buffer<
655 InputIterator, OutputIterator
661 return copy_on_device(first, last, result, queue);
664 // device -> device (specialization for buffer iterators)
//
// Copies through command_queue::enqueue_copy_buffer() instead of a kernel.
// Source and destination offsets and the copy size are passed in bytes:
// iterator index (or element count) multiplied by sizeof(value_type).
// NOTE(review): the destination buffer argument, any empty-range check and
// the return statement are not visible in this excerpt.
665 template<class InputIterator, class OutputIterator>
666 inline OutputIterator
667 dispatch_copy(InputIterator first,
669 OutputIterator result,
670 command_queue &queue,
671 typename boost::enable_if<
673 is_device_iterator<InputIterator>,
674 is_device_iterator<OutputIterator>,
675 can_copy_with_copy_buffer<
676 InputIterator, OutputIterator
681 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
682 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
// number of elements to copy
684 difference_type n = std::distance(first, last);
690 queue.enqueue_copy_buffer(first.get_buffer(),
692 first.get_index() * sizeof(value_type),
693 result.get_index() * sizeof(value_type),
694 static_cast<size_t>(n) * sizeof(value_type));
698 // device -> device (async)
//
// Both iterators are required to be device iterators; the copy is forwarded
// to copy_on_device_async() and its future is returned.
// NOTE(review): the mpl wrappers of the enable_if condition are not visible
// in this excerpt; presumably can_copy_with_copy_buffer<> is negated here,
// since a sibling overload handles the buffer-iterator case -- confirm.
699 template<class InputIterator, class OutputIterator>
700 inline future<OutputIterator>
701 dispatch_copy_async(InputIterator first,
703 OutputIterator result,
704 command_queue &queue,
705 typename boost::enable_if<
707 is_device_iterator<InputIterator>,
708 is_device_iterator<OutputIterator>,
710 can_copy_with_copy_buffer<
711 InputIterator, OutputIterator
717 return copy_on_device_async(first, last, result, queue);
720 // device -> device (async, specialization for buffer iterators)
//
// Copies through command_queue::enqueue_copy_buffer() instead of a kernel.
// Offsets and size are passed in bytes (index or element count multiplied by
// sizeof(value_type)). The final future carries result + n and event_.
// NOTE(review): the condition guarding the first return, the declaration of
// event_ and the buffer arguments of enqueue_copy_buffer() are not visible
// in this excerpt.
721 template<class InputIterator, class OutputIterator>
722 inline future<OutputIterator>
723 dispatch_copy_async(InputIterator first,
725 OutputIterator result,
726 command_queue &queue,
727 typename boost::enable_if<
729 is_device_iterator<InputIterator>,
730 is_device_iterator<OutputIterator>,
731 can_copy_with_copy_buffer<
732 InputIterator, OutputIterator
737 typedef typename std::iterator_traits<InputIterator>::value_type value_type;
738 typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
// number of elements to copy
740 difference_type n = std::distance(first, last);
// NOTE(review): presumably the empty-range early exit; it returns the
// unadvanced result iterator with a default-constructed event
743 return make_future(result, event());
747 queue.enqueue_copy_buffer(
750 first.get_index() * sizeof(value_type),
751 result.get_index() * sizeof(value_type),
752 static_cast<size_t>(n) * sizeof(value_type)
755 return make_future(result + n, event_);
// host -> host: enabled only when neither iterator is a device iterator.
// The command queue is not used by the visible body; the copy is performed
// with std::copy() and its return value is passed through.
759 template<class InputIterator, class OutputIterator>
760 inline OutputIterator
761 dispatch_copy(InputIterator first,
763 OutputIterator result,
764 command_queue &queue,
765 typename boost::enable_if_c<
766 !is_device_iterator<InputIterator>::value &&
767 !is_device_iterator<OutputIterator>::value
772 return std::copy(first, last, result);
775 } // end detail namespace
777 /// Copies the values in the range [\p first, \p last) to the range
778 /// beginning at \p result.
780 /// The generic copy() function can be used for a variety of data
781 /// transfer tasks and provides a standard interface to the following
782 /// OpenCL functions:
784 /// \li \c clEnqueueReadBuffer()
785 /// \li \c clEnqueueWriteBuffer()
786 /// \li \c clEnqueueCopyBuffer()
788 /// Unlike the aforementioned OpenCL functions, copy() will also work
789 /// with non-contiguous data-structures (e.g. \c std::list<T>) as
790 /// well as with "fancy" iterators (e.g. transform_iterator).
792 /// \param first first element in the range to copy
793 /// \param last end of the range to copy (one past the last element)
794 /// \param result first element in the result range
795 /// \param queue command queue to perform the operation
797 /// \return \c OutputIterator to the end of the result range
799 /// For example, to copy an array of \c int values on the host to a vector on
802 /// // array on the host
803 /// int data[] = { 1, 2, 3, 4 };
805 /// // vector on the device
806 /// boost::compute::vector<int> vec(4, context);
808 /// // copy values to the device vector
809 /// boost::compute::copy(data, data + 4, vec.begin(), queue);
812 /// The copy algorithm can also be used with standard containers such as
813 /// \c std::vector<T>:
815 /// std::vector<int> host_vector = ...
816 /// boost::compute::vector<int> device_vector = ...
818 /// // copy from the host to the device
819 /// boost::compute::copy(
820 /// host_vector.begin(), host_vector.end(), device_vector.begin(), queue
823 /// // copy from the device to the host
824 /// boost::compute::copy(
825 /// device_vector.begin(), device_vector.end(), host_vector.begin(), queue
829 /// Space complexity: \Omega(1)
831 /// \see copy_n(), copy_if(), copy_async()
// Thin public entry point: all work is delegated to detail::dispatch_copy(),
// whose overloads are selected at compile time via enable_if conditions on
// the iterator types. queue defaults to system::default_queue().
832 template<class InputIterator, class OutputIterator>
833 inline OutputIterator copy(InputIterator first,
835 OutputIterator result,
836 command_queue &queue = system::default_queue())
838 return detail::dispatch_copy(first, last, result, queue);
841 /// Copies the values in the range [\p first, \p last) to the range
842 /// beginning at \p result. The copy is performed asynchronously.
// Thin public entry point: all work is delegated to
// detail::dispatch_copy_async() and its future<OutputIterator> is returned.
// The host-side overloads visible in this file static-assert that the host
// iterator is contiguous ("copy_async() is only supported for contiguous
// host iterators"). queue defaults to system::default_queue().
845 template<class InputIterator, class OutputIterator>
846 inline future<OutputIterator>
847 copy_async(InputIterator first,
849 OutputIterator result,
850 command_queue &queue = system::default_queue())
852 return detail::dispatch_copy_async(first, last, result, queue);
855 } // end compute namespace
856 } // end boost namespace
858 #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP