]>
Commit | Line | Data |
---|---|---|
1 | //---------------------------------------------------------------------------// | |
2 | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_COPY_HPP | |
13 | ||
14 | #include <algorithm> | |
15 | #include <iterator> | |
16 | ||
17 | #include <boost/utility/enable_if.hpp> | |
18 | ||
19 | #include <boost/mpl/and.hpp> | |
20 | #include <boost/mpl/not.hpp> | |
21 | #include <boost/mpl/or.hpp> | |
22 | ||
23 | #include <boost/compute/buffer.hpp> | |
24 | #include <boost/compute/system.hpp> | |
25 | #include <boost/compute/command_queue.hpp> | |
26 | #include <boost/compute/algorithm/detail/copy_on_device.hpp> | |
27 | #include <boost/compute/algorithm/detail/copy_to_device.hpp> | |
28 | #include <boost/compute/algorithm/detail/copy_to_host.hpp> | |
29 | #include <boost/compute/async/future.hpp> | |
30 | #include <boost/compute/container/mapped_view.hpp> | |
31 | #include <boost/compute/detail/device_ptr.hpp> | |
32 | #include <boost/compute/detail/is_contiguous_iterator.hpp> | |
33 | #include <boost/compute/detail/iterator_range_size.hpp> | |
34 | #include <boost/compute/detail/parameter_cache.hpp> | |
35 | #include <boost/compute/iterator/buffer_iterator.hpp> | |
36 | #include <boost/compute/type_traits/type_name.hpp> | |
37 | #include <boost/compute/type_traits/is_device_iterator.hpp> | |
38 | ||
39 | namespace boost { | |
40 | namespace compute { | |
41 | namespace detail { | |
42 | ||
43 | namespace mpl = boost::mpl; | |
44 | ||
45 | // meta-function returning true if copy() between InputIterator and | |
46 | // OutputIterator can be implemented with clEnqueueCopyBuffer(). | |
47 | template<class InputIterator, class OutputIterator> | |
48 | struct can_copy_with_copy_buffer : | |
49 | mpl::and_< | |
50 | mpl::or_< | |
51 | boost::is_same< | |
52 | InputIterator, | |
53 | buffer_iterator<typename InputIterator::value_type> | |
54 | >, | |
55 | boost::is_same< | |
56 | InputIterator, | |
57 | detail::device_ptr<typename InputIterator::value_type> | |
58 | > | |
59 | >, | |
60 | mpl::or_< | |
61 | boost::is_same< | |
62 | OutputIterator, | |
63 | buffer_iterator<typename OutputIterator::value_type> | |
64 | >, | |
65 | boost::is_same< | |
66 | OutputIterator, | |
67 | detail::device_ptr<typename OutputIterator::value_type> | |
68 | > | |
69 | >, | |
70 | boost::is_same< | |
71 | typename InputIterator::value_type, | |
72 | typename OutputIterator::value_type | |
73 | > | |
74 | >::type {}; | |
75 | ||
76 | // meta-function returning true if value_types of HostIterator and | |
77 | // DeviceIterator are same | |
78 | template<class HostIterator, class DeviceIterator> | |
79 | struct is_same_value_type : | |
80 | boost::is_same< | |
81 | typename boost::remove_cv< | |
82 | typename std::iterator_traits<HostIterator>::value_type | |
83 | >::type, | |
84 | typename boost::remove_cv< | |
85 | typename DeviceIterator::value_type | |
86 | >::type | |
87 | >::type {}; | |
88 | ||
89 | // meta-function returning true if value_type of HostIterator is bool | |
90 | template<class HostIterator> | |
91 | struct is_bool_value_type : | |
92 | boost::is_same< | |
93 | typename boost::remove_cv< | |
94 | typename std::iterator_traits<HostIterator>::value_type | |
95 | >::type, | |
96 | bool | |
97 | >::type {}; | |
98 | ||
99 | // host -> device (async) | |
100 | template<class InputIterator, class OutputIterator> | |
101 | inline future<OutputIterator> | |
102 | dispatch_copy_async(InputIterator first, | |
103 | InputIterator last, | |
104 | OutputIterator result, | |
105 | command_queue &queue, | |
106 | typename boost::enable_if< | |
107 | mpl::and_< | |
108 | mpl::not_< | |
109 | is_device_iterator<InputIterator> | |
110 | >, | |
111 | is_device_iterator<OutputIterator>, | |
112 | is_same_value_type<InputIterator, OutputIterator> | |
113 | > | |
114 | >::type* = 0) | |
115 | { | |
116 | BOOST_STATIC_ASSERT_MSG( | |
117 | is_contiguous_iterator<InputIterator>::value, | |
118 | "copy_async() is only supported for contiguous host iterators" | |
119 | ); | |
120 | ||
121 | return copy_to_device_async(first, last, result, queue); | |
122 | } | |
123 | ||
124 | // host -> device (async) | |
125 | // Type mismatch between InputIterator and OutputIterator value_types | |
126 | template<class InputIterator, class OutputIterator> | |
127 | inline future<OutputIterator> | |
128 | dispatch_copy_async(InputIterator first, | |
129 | InputIterator last, | |
130 | OutputIterator result, | |
131 | command_queue &queue, | |
132 | typename boost::enable_if< | |
133 | mpl::and_< | |
134 | mpl::not_< | |
135 | is_device_iterator<InputIterator> | |
136 | >, | |
137 | is_device_iterator<OutputIterator>, | |
138 | mpl::not_< | |
139 | is_same_value_type<InputIterator, OutputIterator> | |
140 | > | |
141 | > | |
142 | >::type* = 0) | |
143 | { | |
144 | BOOST_STATIC_ASSERT_MSG( | |
145 | is_contiguous_iterator<InputIterator>::value, | |
146 | "copy_async() is only supported for contiguous host iterators" | |
147 | ); | |
148 | ||
149 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
150 | ||
151 | const context &context = queue.get_context(); | |
152 | size_t count = iterator_range_size(first, last); | |
153 | ||
154 | if(count < size_t(1)) { | |
155 | return future<OutputIterator>(); | |
156 | } | |
157 | ||
158 | // map [first; last) to device and run copy kernel | |
159 | // on device for copying & casting | |
160 | ::boost::compute::mapped_view<input_type> mapped_host( | |
161 | // make sure it's a pointer to constant data | |
162 | // to force read only mapping | |
163 | const_cast<const input_type*>( | |
164 | ::boost::addressof(*first) | |
165 | ), | |
166 | count, | |
167 | context | |
168 | ); | |
169 | return copy_on_device_async( | |
170 | mapped_host.begin(), mapped_host.end(), result, queue | |
171 | ); | |
172 | } | |
173 | ||
174 | // host -> device | |
175 | // InputIterator is a contiguous iterator | |
176 | template<class InputIterator, class OutputIterator> | |
177 | inline OutputIterator | |
178 | dispatch_copy(InputIterator first, | |
179 | InputIterator last, | |
180 | OutputIterator result, | |
181 | command_queue &queue, | |
182 | typename boost::enable_if< | |
183 | mpl::and_< | |
184 | mpl::not_< | |
185 | is_device_iterator<InputIterator> | |
186 | >, | |
187 | is_device_iterator<OutputIterator>, | |
188 | is_same_value_type<InputIterator, OutputIterator>, | |
189 | is_contiguous_iterator<InputIterator> | |
190 | > | |
191 | >::type* = 0) | |
192 | { | |
193 | return copy_to_device(first, last, result, queue); | |
194 | } | |
195 | ||
196 | // host -> device | |
197 | // Type mismatch between InputIterator and OutputIterator value_types | |
198 | // InputIterator is a contiguous iterator | |
199 | template<class InputIterator, class OutputIterator> | |
200 | inline OutputIterator | |
201 | dispatch_copy(InputIterator first, | |
202 | InputIterator last, | |
203 | OutputIterator result, | |
204 | command_queue &queue, | |
205 | typename boost::enable_if< | |
206 | mpl::and_< | |
207 | mpl::not_< | |
208 | is_device_iterator<InputIterator> | |
209 | >, | |
210 | is_device_iterator<OutputIterator>, | |
211 | mpl::not_< | |
212 | is_same_value_type<InputIterator, OutputIterator> | |
213 | >, | |
214 | is_contiguous_iterator<InputIterator> | |
215 | > | |
216 | >::type* = 0) | |
217 | { | |
218 | typedef typename OutputIterator::value_type output_type; | |
219 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
220 | ||
221 | const device &device = queue.get_device(); | |
222 | ||
223 | // loading parameters | |
224 | std::string cache_key = | |
225 | std::string("__boost_compute_copy_to_device_") | |
226 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
227 | boost::shared_ptr<parameter_cache> parameters = | |
228 | detail::parameter_cache::get_global_cache(device); | |
229 | ||
230 | size_t map_copy_threshold; | |
231 | size_t direct_copy_threshold; | |
232 | ||
233 | // calculate default values of thresholds | |
234 | if (device.type() & device::gpu) { | |
235 | // GPUs | |
236 | map_copy_threshold = 524288; // 0.5 MB | |
237 | direct_copy_threshold = 52428800; // 50 MB | |
238 | } | |
239 | else { | |
240 | // CPUs and other devices | |
241 | map_copy_threshold = 134217728; // 128 MB | |
242 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
243 | } | |
244 | ||
245 | // load thresholds | |
246 | map_copy_threshold = | |
247 | parameters->get( | |
248 | cache_key, "map_copy_threshold", map_copy_threshold | |
249 | ); | |
250 | direct_copy_threshold = | |
251 | parameters->get( | |
252 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
253 | ); | |
254 | ||
255 | // select copy method based on thresholds & input_size_bytes | |
256 | size_t count = iterator_range_size(first, last); | |
257 | size_t input_size_bytes = count * sizeof(input_type); | |
258 | ||
259 | // [0; map_copy_threshold) -> copy_to_device_map() | |
260 | if(input_size_bytes < map_copy_threshold) { | |
261 | return copy_to_device_map(first, last, result, queue); | |
262 | } | |
263 | // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) | |
264 | // on host and then perform copy_to_device() | |
265 | else if(input_size_bytes < direct_copy_threshold) { | |
266 | std::vector<output_type> vector(first, last); | |
267 | return copy_to_device(vector.begin(), vector.end(), result, queue); | |
268 | } | |
269 | ||
270 | // [direct_copy_threshold; inf) -> map [first; last) to device and | |
271 | // run copy kernel on device for copying & casting | |
272 | // At this point we are sure that count > 1 (first != last). | |
273 | ||
274 | // Perform async copy to device, wait for it to be finished and | |
275 | // return the result. | |
276 | // At this point we are sure that count > 1 (first != last), so event | |
277 | // returned by dispatch_copy_async() must be valid. | |
278 | return dispatch_copy_async(first, last, result, queue).get(); | |
279 | } | |
280 | ||
281 | // host -> device | |
282 | // InputIterator is NOT a contiguous iterator | |
283 | template<class InputIterator, class OutputIterator> | |
284 | inline OutputIterator | |
285 | dispatch_copy(InputIterator first, | |
286 | InputIterator last, | |
287 | OutputIterator result, | |
288 | command_queue &queue, | |
289 | typename boost::enable_if< | |
290 | mpl::and_< | |
291 | mpl::not_< | |
292 | is_device_iterator<InputIterator> | |
293 | >, | |
294 | is_device_iterator<OutputIterator>, | |
295 | mpl::not_< | |
296 | is_contiguous_iterator<InputIterator> | |
297 | > | |
298 | > | |
299 | >::type* = 0) | |
300 | { | |
301 | typedef typename OutputIterator::value_type output_type; | |
302 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
303 | ||
304 | const device &device = queue.get_device(); | |
305 | ||
306 | // loading parameters | |
307 | std::string cache_key = | |
308 | std::string("__boost_compute_copy_to_device_") | |
309 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
310 | boost::shared_ptr<parameter_cache> parameters = | |
311 | detail::parameter_cache::get_global_cache(device); | |
312 | ||
313 | size_t map_copy_threshold; | |
314 | size_t direct_copy_threshold; | |
315 | ||
316 | // calculate default values of thresholds | |
317 | if (device.type() & device::gpu) { | |
318 | // GPUs | |
319 | map_copy_threshold = 524288; // 0.5 MB | |
320 | direct_copy_threshold = 52428800; // 50 MB | |
321 | } | |
322 | else { | |
323 | // CPUs and other devices | |
324 | map_copy_threshold = 134217728; // 128 MB | |
325 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
326 | } | |
327 | ||
328 | // load thresholds | |
329 | map_copy_threshold = | |
330 | parameters->get( | |
331 | cache_key, "map_copy_threshold", map_copy_threshold | |
332 | ); | |
333 | direct_copy_threshold = | |
334 | parameters->get( | |
335 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
336 | ); | |
337 | ||
338 | // select copy method based on thresholds & input_size_bytes | |
339 | size_t input_size = iterator_range_size(first, last); | |
340 | size_t input_size_bytes = input_size * sizeof(input_type); | |
341 | ||
342 | // [0; map_copy_threshold) -> copy_to_device_map() | |
343 | // | |
344 | // if direct_copy_threshold is less than map_copy_threshold | |
345 | // copy_to_device_map() is used for every input | |
346 | if(input_size_bytes < map_copy_threshold | |
347 | || direct_copy_threshold <= map_copy_threshold) { | |
348 | return copy_to_device_map(first, last, result, queue); | |
349 | } | |
350 | // [map_copy_threshold; inf) -> convert [first; last) | |
351 | // on host and then perform copy_to_device() | |
352 | std::vector<output_type> vector(first, last); | |
353 | return copy_to_device(vector.begin(), vector.end(), result, queue); | |
354 | } | |
355 | ||
356 | // device -> host (async) | |
357 | template<class InputIterator, class OutputIterator> | |
358 | inline future<OutputIterator> | |
359 | dispatch_copy_async(InputIterator first, | |
360 | InputIterator last, | |
361 | OutputIterator result, | |
362 | command_queue &queue, | |
363 | typename boost::enable_if< | |
364 | mpl::and_< | |
365 | is_device_iterator<InputIterator>, | |
366 | mpl::not_< | |
367 | is_device_iterator<OutputIterator> | |
368 | >, | |
369 | is_same_value_type<OutputIterator, InputIterator> | |
370 | > | |
371 | >::type* = 0) | |
372 | { | |
373 | BOOST_STATIC_ASSERT_MSG( | |
374 | is_contiguous_iterator<OutputIterator>::value, | |
375 | "copy_async() is only supported for contiguous host iterators" | |
376 | ); | |
377 | ||
378 | return copy_to_host_async(first, last, result, queue); | |
379 | } | |
380 | ||
381 | // device -> host (async) | |
382 | // Type mismatch between InputIterator and OutputIterator value_types | |
383 | template<class InputIterator, class OutputIterator> | |
384 | inline future<OutputIterator> | |
385 | dispatch_copy_async(InputIterator first, | |
386 | InputIterator last, | |
387 | OutputIterator result, | |
388 | command_queue &queue, | |
389 | typename boost::enable_if< | |
390 | mpl::and_< | |
391 | is_device_iterator<InputIterator>, | |
392 | mpl::not_< | |
393 | is_device_iterator<OutputIterator> | |
394 | >, | |
395 | mpl::not_< | |
396 | is_same_value_type<OutputIterator, InputIterator> | |
397 | > | |
398 | > | |
399 | >::type* = 0) | |
400 | { | |
401 | BOOST_STATIC_ASSERT_MSG( | |
402 | is_contiguous_iterator<OutputIterator>::value, | |
403 | "copy_async() is only supported for contiguous host iterators" | |
404 | ); | |
405 | ||
406 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
407 | const context &context = queue.get_context(); | |
408 | size_t count = iterator_range_size(first, last); | |
409 | ||
410 | if(count < size_t(1)) { | |
411 | return future<OutputIterator>(); | |
412 | } | |
413 | ||
414 | // map host memory to device | |
415 | buffer mapped_host( | |
416 | context, | |
417 | count * sizeof(output_type), | |
418 | buffer::write_only | buffer::use_host_ptr, | |
419 | static_cast<void*>( | |
420 | ::boost::addressof(*result) | |
421 | ) | |
422 | ); | |
423 | // copy async on device | |
424 | ::boost::compute::future<buffer_iterator<output_type> > future = | |
425 | copy_on_device_async( | |
426 | first, | |
427 | last, | |
428 | make_buffer_iterator<output_type>(mapped_host), | |
429 | queue | |
430 | ); | |
431 | // update host memory asynchronously by maping and unmaping memory | |
432 | event map_event; | |
433 | void* ptr = queue.enqueue_map_buffer_async( | |
434 | mapped_host, | |
435 | CL_MAP_READ, | |
436 | 0, | |
437 | count * sizeof(output_type), | |
438 | map_event, | |
439 | future.get_event() | |
440 | ); | |
441 | event unmap_event = | |
442 | queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); | |
443 | return make_future(result + count, unmap_event); | |
444 | } | |
445 | ||
446 | // device -> host | |
447 | // OutputIterator is a contiguous iterator | |
448 | template<class InputIterator, class OutputIterator> | |
449 | inline OutputIterator | |
450 | dispatch_copy(InputIterator first, | |
451 | InputIterator last, | |
452 | OutputIterator result, | |
453 | command_queue &queue, | |
454 | typename boost::enable_if< | |
455 | mpl::and_< | |
456 | is_device_iterator<InputIterator>, | |
457 | mpl::not_< | |
458 | is_device_iterator<OutputIterator> | |
459 | >, | |
460 | is_same_value_type<OutputIterator, InputIterator>, | |
461 | is_contiguous_iterator<OutputIterator>, | |
462 | mpl::not_< | |
463 | is_bool_value_type<OutputIterator> | |
464 | > | |
465 | > | |
466 | >::type* = 0) | |
467 | { | |
468 | return copy_to_host(first, last, result, queue); | |
469 | } | |
470 | ||
471 | // device -> host | |
472 | // Type mismatch between InputIterator and OutputIterator value_types | |
473 | // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator | |
474 | // is a boolean type. | |
475 | template<class InputIterator, class OutputIterator> | |
476 | inline OutputIterator | |
477 | dispatch_copy(InputIterator first, | |
478 | InputIterator last, | |
479 | OutputIterator result, | |
480 | command_queue &queue, | |
481 | typename boost::enable_if< | |
482 | mpl::and_< | |
483 | is_device_iterator<InputIterator>, | |
484 | mpl::not_< | |
485 | is_device_iterator<OutputIterator> | |
486 | >, | |
487 | mpl::or_< | |
488 | mpl::not_< | |
489 | is_contiguous_iterator<OutputIterator> | |
490 | >, | |
491 | is_bool_value_type<OutputIterator> | |
492 | > | |
493 | > | |
494 | >::type* = 0) | |
495 | { | |
496 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
497 | typedef typename InputIterator::value_type input_type; | |
498 | ||
499 | const device &device = queue.get_device(); | |
500 | ||
501 | // loading parameters | |
502 | std::string cache_key = | |
503 | std::string("__boost_compute_copy_to_host_") | |
504 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
505 | boost::shared_ptr<parameter_cache> parameters = | |
506 | detail::parameter_cache::get_global_cache(device); | |
507 | ||
508 | size_t map_copy_threshold; | |
509 | size_t direct_copy_threshold; | |
510 | ||
511 | // calculate default values of thresholds | |
512 | if (device.type() & device::gpu) { | |
513 | // GPUs | |
514 | map_copy_threshold = 33554432; // 30 MB | |
515 | direct_copy_threshold = 0; // it's never efficient for GPUs | |
516 | } | |
517 | else { | |
518 | // CPUs and other devices | |
519 | map_copy_threshold = 134217728; // 128 MB | |
520 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
521 | } | |
522 | ||
523 | // load thresholds | |
524 | map_copy_threshold = | |
525 | parameters->get( | |
526 | cache_key, "map_copy_threshold", map_copy_threshold | |
527 | ); | |
528 | direct_copy_threshold = | |
529 | parameters->get( | |
530 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
531 | ); | |
532 | ||
533 | // select copy method based on thresholds & input_size_bytes | |
534 | size_t count = iterator_range_size(first, last); | |
535 | size_t input_size_bytes = count * sizeof(input_type); | |
536 | ||
537 | // [0; map_copy_threshold) -> copy_to_host_map() | |
538 | // | |
539 | // if direct_copy_threshold is less than map_copy_threshold | |
540 | // copy_to_host_map() is used for every input | |
541 | if(input_size_bytes < map_copy_threshold | |
542 | || direct_copy_threshold <= map_copy_threshold) { | |
543 | return copy_to_host_map(first, last, result, queue); | |
544 | } | |
545 | // [map_copy_threshold; inf) -> copy [first;last) to temporary vector | |
546 | // then copy (and convert) to result using std::copy() | |
547 | std::vector<input_type> vector(count); | |
548 | copy_to_host(first, last, vector.begin(), queue); | |
549 | return std::copy(vector.begin(), vector.end(), result); | |
550 | } | |
551 | ||
552 | // device -> host | |
553 | // Type mismatch between InputIterator and OutputIterator value_types | |
554 | // OutputIterator is a contiguous iterator | |
555 | // value_type of OutputIterator is NOT a boolean type | |
556 | template<class InputIterator, class OutputIterator> | |
557 | inline OutputIterator | |
558 | dispatch_copy(InputIterator first, | |
559 | InputIterator last, | |
560 | OutputIterator result, | |
561 | command_queue &queue, | |
562 | typename boost::enable_if< | |
563 | mpl::and_< | |
564 | is_device_iterator<InputIterator>, | |
565 | mpl::not_< | |
566 | is_device_iterator<OutputIterator> | |
567 | >, | |
568 | mpl::not_< | |
569 | is_same_value_type<OutputIterator, InputIterator> | |
570 | >, | |
571 | is_contiguous_iterator<OutputIterator>, | |
572 | mpl::not_< | |
573 | is_bool_value_type<OutputIterator> | |
574 | > | |
575 | > | |
576 | >::type* = 0) | |
577 | { | |
578 | typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | |
579 | typedef typename InputIterator::value_type input_type; | |
580 | ||
581 | const device &device = queue.get_device(); | |
582 | ||
583 | // loading parameters | |
584 | std::string cache_key = | |
585 | std::string("__boost_compute_copy_to_host_") | |
586 | + type_name<input_type>() + "_" + type_name<output_type>(); | |
587 | boost::shared_ptr<parameter_cache> parameters = | |
588 | detail::parameter_cache::get_global_cache(device); | |
589 | ||
590 | size_t map_copy_threshold; | |
591 | size_t direct_copy_threshold; | |
592 | ||
593 | // calculate default values of thresholds | |
594 | if (device.type() & device::gpu) { | |
595 | // GPUs | |
596 | map_copy_threshold = 524288; // 0.5 MB | |
597 | direct_copy_threshold = 52428800; // 50 MB | |
598 | } | |
599 | else { | |
600 | // CPUs and other devices | |
601 | map_copy_threshold = 134217728; // 128 MB | |
602 | direct_copy_threshold = 0; // it's never efficient for CPUs | |
603 | } | |
604 | ||
605 | // load thresholds | |
606 | map_copy_threshold = | |
607 | parameters->get( | |
608 | cache_key, "map_copy_threshold", map_copy_threshold | |
609 | ); | |
610 | direct_copy_threshold = | |
611 | parameters->get( | |
612 | cache_key, "direct_copy_threshold", direct_copy_threshold | |
613 | ); | |
614 | ||
615 | // select copy method based on thresholds & input_size_bytes | |
616 | size_t count = iterator_range_size(first, last); | |
617 | size_t input_size_bytes = count * sizeof(input_type); | |
618 | ||
619 | // [0; map_copy_threshold) -> copy_to_host_map() | |
620 | if(input_size_bytes < map_copy_threshold) { | |
621 | return copy_to_host_map(first, last, result, queue); | |
622 | } | |
623 | // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to | |
624 | // temporary vector then copy (and convert) to result using std::copy() | |
625 | else if(input_size_bytes < direct_copy_threshold) { | |
626 | std::vector<input_type> vector(count); | |
627 | copy_to_host(first, last, vector.begin(), queue); | |
628 | return std::copy(vector.begin(), vector.end(), result); | |
629 | } | |
630 | ||
631 | // [direct_copy_threshold; inf) -> map [result; result + input_size) to | |
632 | // device and run copy kernel on device for copying & casting | |
633 | // map host memory to device. | |
634 | ||
635 | // Perform async copy to host, wait for it to be finished and | |
636 | // return the result. | |
637 | // At this point we are sure that count > 1 (first != last), so event | |
638 | // returned by dispatch_copy_async() must be valid. | |
639 | return dispatch_copy_async(first, last, result, queue).get(); | |
640 | } | |
641 | ||
642 | // device -> device | |
643 | template<class InputIterator, class OutputIterator> | |
644 | inline OutputIterator | |
645 | dispatch_copy(InputIterator first, | |
646 | InputIterator last, | |
647 | OutputIterator result, | |
648 | command_queue &queue, | |
649 | typename boost::enable_if< | |
650 | mpl::and_< | |
651 | is_device_iterator<InputIterator>, | |
652 | is_device_iterator<OutputIterator>, | |
653 | mpl::not_< | |
654 | can_copy_with_copy_buffer< | |
655 | InputIterator, OutputIterator | |
656 | > | |
657 | > | |
658 | > | |
659 | >::type* = 0) | |
660 | { | |
661 | return copy_on_device(first, last, result, queue); | |
662 | } | |
663 | ||
664 | // device -> device (specialization for buffer iterators) | |
665 | template<class InputIterator, class OutputIterator> | |
666 | inline OutputIterator | |
667 | dispatch_copy(InputIterator first, | |
668 | InputIterator last, | |
669 | OutputIterator result, | |
670 | command_queue &queue, | |
671 | typename boost::enable_if< | |
672 | mpl::and_< | |
673 | is_device_iterator<InputIterator>, | |
674 | is_device_iterator<OutputIterator>, | |
675 | can_copy_with_copy_buffer< | |
676 | InputIterator, OutputIterator | |
677 | > | |
678 | > | |
679 | >::type* = 0) | |
680 | { | |
681 | typedef typename std::iterator_traits<InputIterator>::value_type value_type; | |
682 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
683 | ||
684 | difference_type n = std::distance(first, last); | |
685 | if(n < 1){ | |
686 | // nothing to copy | |
687 | return result; | |
688 | } | |
689 | ||
690 | queue.enqueue_copy_buffer(first.get_buffer(), | |
691 | result.get_buffer(), | |
692 | first.get_index() * sizeof(value_type), | |
693 | result.get_index() * sizeof(value_type), | |
694 | static_cast<size_t>(n) * sizeof(value_type)); | |
695 | return result + n; | |
696 | } | |
697 | ||
698 | // device -> device (async) | |
699 | template<class InputIterator, class OutputIterator> | |
700 | inline future<OutputIterator> | |
701 | dispatch_copy_async(InputIterator first, | |
702 | InputIterator last, | |
703 | OutputIterator result, | |
704 | command_queue &queue, | |
705 | typename boost::enable_if< | |
706 | mpl::and_< | |
707 | is_device_iterator<InputIterator>, | |
708 | is_device_iterator<OutputIterator>, | |
709 | mpl::not_< | |
710 | can_copy_with_copy_buffer< | |
711 | InputIterator, OutputIterator | |
712 | > | |
713 | > | |
714 | > | |
715 | >::type* = 0) | |
716 | { | |
717 | return copy_on_device_async(first, last, result, queue); | |
718 | } | |
719 | ||
720 | // device -> device (async, specialization for buffer iterators) | |
721 | template<class InputIterator, class OutputIterator> | |
722 | inline future<OutputIterator> | |
723 | dispatch_copy_async(InputIterator first, | |
724 | InputIterator last, | |
725 | OutputIterator result, | |
726 | command_queue &queue, | |
727 | typename boost::enable_if< | |
728 | mpl::and_< | |
729 | is_device_iterator<InputIterator>, | |
730 | is_device_iterator<OutputIterator>, | |
731 | can_copy_with_copy_buffer< | |
732 | InputIterator, OutputIterator | |
733 | > | |
734 | > | |
735 | >::type* = 0) | |
736 | { | |
737 | typedef typename std::iterator_traits<InputIterator>::value_type value_type; | |
738 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
739 | ||
740 | difference_type n = std::distance(first, last); | |
741 | if(n < 1){ | |
742 | // nothing to copy | |
743 | return make_future(result, event()); | |
744 | } | |
745 | ||
746 | event event_ = | |
747 | queue.enqueue_copy_buffer( | |
748 | first.get_buffer(), | |
749 | result.get_buffer(), | |
750 | first.get_index() * sizeof(value_type), | |
751 | result.get_index() * sizeof(value_type), | |
752 | static_cast<size_t>(n) * sizeof(value_type) | |
753 | ); | |
754 | ||
755 | return make_future(result + n, event_); | |
756 | } | |
757 | ||
758 | // host -> host | |
759 | template<class InputIterator, class OutputIterator> | |
760 | inline OutputIterator | |
761 | dispatch_copy(InputIterator first, | |
762 | InputIterator last, | |
763 | OutputIterator result, | |
764 | command_queue &queue, | |
765 | typename boost::enable_if_c< | |
766 | !is_device_iterator<InputIterator>::value && | |
767 | !is_device_iterator<OutputIterator>::value | |
768 | >::type* = 0) | |
769 | { | |
770 | (void) queue; | |
771 | ||
772 | return std::copy(first, last, result); | |
773 | } | |
774 | ||
775 | } // end detail namespace | |
776 | ||
777 | /// Copies the values in the range [\p first, \p last) to the range | |
778 | /// beginning at \p result. | |
779 | /// | |
780 | /// The generic copy() function can be used for a variety of data | |
781 | /// transfer tasks and provides a standard interface to the following | |
782 | /// OpenCL functions: | |
783 | /// | |
784 | /// \li \c clEnqueueReadBuffer() | |
785 | /// \li \c clEnqueueWriteBuffer() | |
786 | /// \li \c clEnqueueCopyBuffer() | |
787 | /// | |
788 | /// Unlike the aforementioned OpenCL functions, copy() will also work | |
789 | /// with non-contiguous data-structures (e.g. \c std::list<T>) as | |
790 | /// well as with "fancy" iterators (e.g. transform_iterator). | |
791 | /// | |
792 | /// \param first first element in the range to copy | |
793 | /// \param last last element in the range to copy | |
794 | /// \param result first element in the result range | |
795 | /// \param queue command queue to perform the operation | |
796 | /// | |
797 | /// \return \c OutputIterator to the end of the result range | |
798 | /// | |
799 | /// For example, to copy an array of \c int values on the host to a vector on | |
800 | /// the device: | |
801 | /// \code | |
802 | /// // array on the host | |
803 | /// int data[] = { 1, 2, 3, 4 }; | |
804 | /// | |
805 | /// // vector on the device | |
806 | /// boost::compute::vector<int> vec(4, context); | |
807 | /// | |
808 | /// // copy values to the device vector | |
809 | /// boost::compute::copy(data, data + 4, vec.begin(), queue); | |
810 | /// \endcode | |
811 | /// | |
812 | /// The copy algorithm can also be used with standard containers such as | |
813 | /// \c std::vector<T>: | |
814 | /// \code | |
815 | /// std::vector<int> host_vector = ... | |
816 | /// boost::compute::vector<int> device_vector = ... | |
817 | /// | |
818 | /// // copy from the host to the device | |
819 | /// boost::compute::copy( | |
820 | /// host_vector.begin(), host_vector.end(), device_vector.begin(), queue | |
821 | /// ); | |
822 | /// | |
823 | /// // copy from the device to the host | |
824 | /// boost::compute::copy( | |
825 | /// device_vector.begin(), device_vector.end(), host_vector.begin(), queue | |
826 | /// ); | |
827 | /// \endcode | |
828 | /// | |
829 | /// \see copy_n(), copy_if(), copy_async() | |
830 | template<class InputIterator, class OutputIterator> | |
831 | inline OutputIterator copy(InputIterator first, | |
832 | InputIterator last, | |
833 | OutputIterator result, | |
834 | command_queue &queue = system::default_queue()) | |
835 | { | |
836 | return detail::dispatch_copy(first, last, result, queue); | |
837 | } | |
838 | ||
839 | /// Copies the values in the range [\p first, \p last) to the range | |
840 | /// beginning at \p result. The copy is performed asynchronously. | |
841 | /// | |
842 | /// \see copy() | |
843 | template<class InputIterator, class OutputIterator> | |
844 | inline future<OutputIterator> | |
845 | copy_async(InputIterator first, | |
846 | InputIterator last, | |
847 | OutputIterator result, | |
848 | command_queue &queue = system::default_queue()) | |
849 | { | |
850 | return detail::dispatch_copy_async(first, last, result, queue); | |
851 | } | |
852 | ||
853 | } // end compute namespace | |
854 | } // end boost namespace | |
855 | ||
856 | #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP |