//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP

#include <algorithm>
#include <cmath>
#include <iterator>

#include <boost/utility/result_of.hpp>

#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/memory/local_buffer.hpp>

23 | namespace boost { | |
24 | namespace compute { | |
25 | namespace detail { | |
26 | ||
27 | template<class Iterator, class BinaryFunction> | |
28 | inline void inplace_reduce(Iterator first, | |
29 | Iterator last, | |
30 | BinaryFunction function, | |
31 | command_queue &queue) | |
32 | { | |
33 | typedef typename | |
34 | std::iterator_traits<Iterator>::value_type | |
35 | value_type; | |
36 | ||
37 | size_t input_size = iterator_range_size(first, last); | |
38 | if(input_size < 2){ | |
39 | return; | |
40 | } | |
41 | ||
42 | const context &context = queue.get_context(); | |
43 | ||
44 | size_t block_size = 64; | |
45 | size_t values_per_thread = 8; | |
46 | size_t block_count = input_size / (block_size * values_per_thread); | |
47 | if(block_count * block_size * values_per_thread != input_size) | |
48 | block_count++; | |
49 | ||
50 | vector<value_type> output(block_count, context); | |
51 | ||
52 | meta_kernel k("inplace_reduce"); | |
53 | size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input"); | |
54 | size_t input_size_arg = k.add_arg<const uint_>("input_size"); | |
55 | size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output"); | |
56 | size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch"); | |
57 | k << | |
58 | "const uint gid = get_global_id(0);\n" << | |
59 | "const uint lid = get_local_id(0);\n" << | |
60 | "const uint values_per_thread =\n" | |
61 | << uint_(values_per_thread) << ";\n" << | |
62 | ||
63 | // thread reduce | |
64 | "const uint index = gid * values_per_thread;\n" << | |
65 | "if(index < input_size){\n" << | |
66 | k.decl<value_type>("sum") << " = input[index];\n" << | |
67 | "for(uint i = 1;\n" << | |
68 | "i < values_per_thread && (index + i) < input_size;\n" << | |
69 | "i++){\n" << | |
70 | " sum = " << | |
71 | function(k.var<value_type>("sum"), | |
72 | k.var<value_type>("input[index+i]")) << ";\n" << | |
73 | "}\n" << | |
74 | "scratch[lid] = sum;\n" << | |
75 | "}\n" << | |
76 | ||
77 | // local reduce | |
78 | "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" << | |
79 | " barrier(CLK_LOCAL_MEM_FENCE);\n" << | |
80 | " uint mask = (i << 1) - 1;\n" << | |
81 | " uint next_index = (gid + i) * values_per_thread;\n" | |
82 | " if((lid & mask) == 0 && next_index < input_size){\n" << | |
83 | " scratch[lid] = " << | |
84 | function(k.var<value_type>("scratch[lid]"), | |
85 | k.var<value_type>("scratch[lid+i]")) << ";\n" << | |
86 | " }\n" << | |
87 | "}\n" << | |
88 | ||
89 | // write output for block | |
90 | "if(lid == 0){\n" << | |
91 | " output[get_group_id(0)] = scratch[0];\n" << | |
92 | "}\n" | |
93 | ; | |
94 | ||
95 | const buffer *input_buffer = &first.get_buffer(); | |
96 | const buffer *output_buffer = &output.get_buffer(); | |
97 | ||
98 | kernel kernel = k.compile(context); | |
99 | ||
100 | while(input_size > 1){ | |
101 | kernel.set_arg(input_arg, *input_buffer); | |
102 | kernel.set_arg(input_size_arg, static_cast<uint_>(input_size)); | |
103 | kernel.set_arg(output_arg, *output_buffer); | |
104 | kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size)); | |
105 | ||
106 | queue.enqueue_1d_range_kernel(kernel, | |
107 | 0, | |
108 | block_count * block_size, | |
109 | block_size); | |
110 | ||
111 | input_size = | |
112 | static_cast<size_t>( | |
113 | std::ceil(float(input_size) / (block_size * values_per_thread) | |
114 | ) | |
115 | ); | |
116 | ||
117 | block_count = input_size / (block_size * values_per_thread); | |
118 | if(block_count * block_size * values_per_thread != input_size) | |
119 | block_count++; | |
120 | ||
121 | std::swap(input_buffer, output_buffer); | |
122 | } | |
123 | ||
124 | if(input_buffer != &first.get_buffer()){ | |
125 | ::boost::compute::copy(output.begin(), | |
126 | output.begin() + 1, | |
127 | first, | |
128 | queue); | |
129 | } | |
130 | } | |
131 | ||
132 | } // end detail namespace | |
133 | } // end compute namespace | |
134 | } // end boost namespace | |

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP