]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/scan_on_cpu.hpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / scan_on_cpu.hpp
CommitLineData
7c673cae
FG
1//---------------------------------------------------------------------------//
2// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
3//
4// Distributed under the Boost Software License, Version 1.0
5// See accompanying file LICENSE_1_0.txt or copy at
6// http://www.boost.org/LICENSE_1_0.txt
7//
8// See http://boostorg.github.com/compute for more information.
9//---------------------------------------------------------------------------//
10
11#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
12#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
13
14#include <iterator>
15
16#include <boost/compute/device.hpp>
17#include <boost/compute/kernel.hpp>
18#include <boost/compute/command_queue.hpp>
19#include <boost/compute/algorithm/detail/serial_scan.hpp>
20#include <boost/compute/detail/meta_kernel.hpp>
21#include <boost/compute/detail/iterator_range_size.hpp>
22#include <boost/compute/detail/parameter_cache.hpp>
23
24namespace boost {
25namespace compute {
26namespace detail {
27
28template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
29inline OutputIterator scan_on_cpu(InputIterator first,
30 InputIterator last,
31 OutputIterator result,
32 bool exclusive,
33 T init,
34 BinaryOperator op,
35 command_queue &queue)
36{
37 typedef typename
38 std::iterator_traits<InputIterator>::value_type input_type;
39 typedef typename
40 std::iterator_traits<OutputIterator>::value_type output_type;
41
42 const context &context = queue.get_context();
43 const device &device = queue.get_device();
44 const size_t compute_units = queue.get_device().compute_units();
45
46 boost::shared_ptr<parameter_cache> parameters =
47 detail::parameter_cache::get_global_cache(device);
48
49 std::string cache_key =
50 "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
51
52 // for inputs smaller than serial_scan_threshold
53 // serial_scan algorithm is used
54 uint_ serial_scan_threshold =
55 parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
56 serial_scan_threshold =
57 (std::max)(serial_scan_threshold, uint_(compute_units));
58
59 size_t count = detail::iterator_range_size(first, last);
60 if(count == 0){
61 return result;
62 }
63 else if(count < serial_scan_threshold) {
64 return serial_scan(first, last, result, exclusive, init, op, queue);
65 }
66
67 buffer block_partial_sums(context, sizeof(output_type) * compute_units );
68
69 // create scan kernel
70 meta_kernel k("scan_on_cpu_block_scan");
71
72 // Arguments
73 size_t count_arg = k.add_arg<uint_>("count");
74 size_t init_arg = k.add_arg<output_type>("initial_value");
75 size_t block_partial_sums_arg =
76 k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
77
78 k <<
79 "uint block = " <<
80 "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
81 "uint index = get_global_id(0) * block;\n" <<
82 "uint end = min(count, index + block);\n";
83
84 if(!exclusive){
85 k <<
86 k.decl<output_type>("sum") << " = " <<
87 first[k.var<uint_>("index")] << ";\n" <<
88 result[k.var<uint_>("index")] << " = sum;\n" <<
89 "index++;\n";
90 }
91 else {
92 k <<
93 k.decl<output_type>("sum") << ";\n" <<
94 "if(index == 0){\n" <<
95 "sum = initial_value;\n" <<
96 "}\n" <<
97 "else {\n" <<
98 "sum = " << first[k.var<uint_>("index")] << ";\n" <<
99 "index++;\n" <<
100 "}\n";
101 }
102
103 k <<
104 "while(index < end){\n" <<
105 // load next value
106 k.decl<const input_type>("value") << " = "
107 << first[k.var<uint_>("index")] << ";\n";
108
109 if(exclusive){
110 k <<
111 "if(get_global_id(0) == 0){\n" <<
112 result[k.var<uint_>("index")] << " = sum;\n" <<
113 "}\n";
114 }
115 k <<
116 "sum = " << op(k.var<output_type>("sum"),
117 k.var<output_type>("value")) << ";\n";
118
119 if(!exclusive){
120 k <<
121 "if(get_global_id(0) == 0){\n" <<
122 result[k.var<uint_>("index")] << " = sum;\n" <<
123 "}\n";
124 }
125
126 k <<
127 "index++;\n" <<
128 "}\n" << // end while
129 "block_partial_sums[get_global_id(0)] = sum;\n";
130
131 // compile scan kernel
132 kernel block_scan_kernel = k.compile(context);
133
134 // setup kernel arguments
135 block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
136 block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
137 block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
138
139 // execute the kernel
140 size_t global_work_size = compute_units;
141 queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
142
143 // scan is done
144 if(compute_units < 2) {
145 return result + count;
146 }
147
148 // final scan kernel
149 meta_kernel l("scan_on_cpu_final_scan");
150
151 // Arguments
152 count_arg = l.add_arg<uint_>("count");
153 block_partial_sums_arg =
154 l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
155
156 l <<
157 "uint block = " <<
158 "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
159 "uint index = block + get_global_id(0) * block;\n" <<
160 "uint end = min(count, index + block);\n" <<
161
162 k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
163 "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
164 "sum = " << op(k.var<output_type>("sum"),
165 k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
166 "}\n" <<
167
168 "while(index < end){\n";
169 if(exclusive){
170 l <<
171 l.decl<output_type>("value") << " = "
172 << first[k.var<uint_>("index")] << ";\n" <<
173 result[k.var<uint_>("index")] << " = sum;\n" <<
174 "sum = " << op(k.var<output_type>("sum"),
175 k.var<output_type>("value")) << ";\n";
176 }
177 else {
178 l <<
179 "sum = " << op(k.var<output_type>("sum"),
180 first[k.var<uint_>("index")]) << ";\n" <<
181 result[k.var<uint_>("index")] << " = sum;\n";
182 }
183 l <<
184 "index++;\n" <<
185 "}\n";
186
187
188 // compile scan kernel
189 kernel final_scan_kernel = l.compile(context);
190
191 // setup kernel arguments
192 final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
193 final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
194
195 // execute the kernel
196 global_work_size = compute_units;
197 queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
198
199 // return iterator pointing to the end of the result range
200 return result + count;
201}
202
203} // end detail namespace
204} // end compute namespace
205} // end boost namespace
206
207#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP