//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP

#include <algorithm>
#include <cmath>
#include <iterator>

#include <boost/utility/result_of.hpp>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/memory/local_buffer.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class Iterator, class BinaryFunction>
inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

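    // Launch geometry: each work-item serially reduces values_per_thread
    // consecutive elements and each work-group of block_size work-items
    // produces one partial sum, so one pass shrinks the input by a factor
    // of block_size * values_per_thread.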
    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

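    // One partial sum per work-group is written to this temporary buffer.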
    vector<value_type> output(block_count, context);

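    // Build the OpenCL kernel source with meta_kernel. Phase one: each
    // work-item folds its values_per_thread elements into a private sum.
    // Phase two: the work-group tree-reduces those sums in local memory
    // and work-item 0 writes the group's result to output[group_id].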
    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n"
            << uint_(values_per_thread) << ";\n" <<

        // thread reduce
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                "i < values_per_thread && (index + i) < input_size;\n" <<
                "i++){\n" <<
            "    sum = " <<
                function(k.var<value_type>("sum"),
                         k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

        // local reduce
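        // (stride-doubling tree reduction: at each level the work-item whose
        //  lid has the low bits clear combines its partial sum with the one
        //  from lid + i; the barrier keeps local memory consistent)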
78 "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
79 " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
80 " uint mask = (i << 1) - 1;\n" <<
81 " uint next_index = (gid + i) * values_per_thread;\n"
82 " if((lid & mask) == 0 && next_index < input_size){\n" <<
83 " scratch[lid] = " <<
84 function(k.var<value_type>("scratch[lid]"),
85 k.var<value_type>("scratch[lid+i]")) << ";\n" <<
86 " }\n" <<
87 "}\n" <<
88
89 // write output for block
90 "if(lid == 0){\n" <<
91 " output[get_group_id(0)] = scratch[0];\n" <<
92 "}\n"
93 ;
94
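    // The passes ping-pong between the caller's buffer and the temporary
    // output buffer; after each pass the roles of the two pointers swap.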
    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

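    // Launch the kernel repeatedly until a single value remains; each pass
    // divides input_size by block_size * values_per_thread (rounded up).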
    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        input_size =
            static_cast<size_t>(
                std::ceil(float(input_size) / (block_size * values_per_thread))
            );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

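    // After an odd number of passes the reduced value ends up in the
    // temporary buffer; copy that single element back to the front of the
    // caller's range so the result is always at *first.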
    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
                               queue);
    }
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
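
//---------------------------------------------------------------------------//
// Usage sketch (editor's addition, not part of the upstream header). The
// detail-level inplace_reduce() leaves the reduction of [first, last) in
// *first. A minimal host-side sketch, assuming a float range and the
// built-in boost::compute::plus<float> function object, might look like:
//
//   #include <vector>
//   #include <boost/compute/system.hpp>
//   #include <boost/compute/functional.hpp>
//   #include <boost/compute/algorithm/copy_n.hpp>
//
//   namespace compute = boost::compute;
//
//   compute::device gpu = compute::system::default_device();
//   compute::context ctx(gpu);
//   compute::command_queue queue(ctx, gpu);
//
//   std::vector<float> host_data(1024, 1.0f);              // 1024 ones
//   compute::vector<float> device_data(host_data.begin(),
//                                      host_data.end(),
//                                      queue);
//
//   compute::detail::inplace_reduce(device_data.begin(),
//                                   device_data.end(),
//                                   compute::plus<float>(),
//                                   queue);
//
//   float sum = 0;
//   compute::copy_n(device_data.begin(), 1, &sum, queue);  // sum == 1024
//
// Public code would normally call boost::compute::reduce(); the detail
// function is invoked directly here only to illustrate this header.
//---------------------------------------------------------------------------//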