]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / merge_sort_on_cpu.hpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
13
14 #include <boost/compute/kernel.hpp>
15 #include <boost/compute/program.hpp>
16 #include <boost/compute/command_queue.hpp>
17 #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
18 #include <boost/compute/container/vector.hpp>
19 #include <boost/compute/detail/meta_kernel.hpp>
20 #include <boost/compute/detail/iterator_range_size.hpp>
21
22 namespace boost {
23 namespace compute {
24 namespace detail {
25
26 template<class KeyIterator, class ValueIterator, class Compare>
27 inline void merge_blocks(KeyIterator keys_first,
28 ValueIterator values_first,
29 KeyIterator keys_result,
30 ValueIterator values_result,
31 Compare compare,
32 size_t count,
33 const size_t block_size,
34 const bool sort_by_key,
35 command_queue &queue)
36 {
37 (void) values_result;
38 (void) values_first;
39
40 meta_kernel k("merge_sort_on_cpu_merge_blocks");
41 size_t count_arg = k.add_arg<const uint_>("count");
42 size_t block_size_arg = k.add_arg<uint_>("block_size");
43
44 k <<
45 k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
46 k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
47 k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
48 k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
49 k.decl<uint_>("result_idx") << " = b1_start;\n" <<
50
51 // merging block 1 and block 2 (stable)
52 "while(b1_start < b1_end && b2_start < b2_end){\n" <<
53 " if( " << compare(keys_first[k.var<uint_>("b2_start")],
54 keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
55 " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
56 keys_first[k.var<uint_>("b2_start")] << ";\n";
57 if(sort_by_key){
58 k <<
59 " " << values_result[k.var<uint_>("result_idx")] << " = " <<
60 values_first[k.var<uint_>("b2_start")] << ";\n";
61 }
62 k <<
63 " b2_start++;\n" <<
64 " }\n" <<
65 " else {\n" <<
66 " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
67 keys_first[k.var<uint_>("b1_start")] << ";\n";
68 if(sort_by_key){
69 k <<
70 " " << values_result[k.var<uint_>("result_idx")] << " = " <<
71 values_first[k.var<uint_>("b1_start")] << ";\n";
72 }
73 k <<
74 " b1_start++;\n" <<
75 " }\n" <<
76 " result_idx++;\n" <<
77 "}\n" <<
78 "while(b1_start < b1_end){\n" <<
79 " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
80 keys_first[k.var<uint_>("b1_start")] << ";\n";
81 if(sort_by_key){
82 k <<
83 " " << values_result[k.var<uint_>("result_idx")] << " = " <<
84 values_first[k.var<uint_>("b1_start")] << ";\n";
85 }
86 k <<
87 " b1_start++;\n" <<
88 " result_idx++;\n" <<
89 "}\n" <<
90 "while(b2_start < b2_end){\n" <<
91 " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
92 keys_first[k.var<uint_>("b2_start")] << ";\n";
93 if(sort_by_key){
94 k <<
95 " " << values_result[k.var<uint_>("result_idx")] << " = " <<
96 values_first[k.var<uint_>("b2_start")] << ";\n";
97 }
98 k <<
99 " b2_start++;\n" <<
100 " result_idx++;\n" <<
101 "}\n";
102
103 const context &context = queue.get_context();
104 ::boost::compute::kernel kernel = k.compile(context);
105 kernel.set_arg(count_arg, static_cast<const uint_>(count));
106 kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
107
108 const size_t global_size = static_cast<size_t>(
109 std::ceil(float(count) / (2 * block_size))
110 );
111 queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
112 }
113
114 template<class Iterator, class Compare>
115 inline void merge_blocks(Iterator first,
116 Iterator result,
117 Compare compare,
118 size_t count,
119 const size_t block_size,
120 const bool sort_by_key,
121 command_queue &queue)
122 {
123 // dummy iterator as it's not sort by key
124 Iterator dummy;
125 merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue);
126 }
127
128 template<class Iterator, class Compare>
129 inline void dispatch_merge_blocks(Iterator first,
130 Iterator result,
131 Compare compare,
132 size_t count,
133 const size_t block_size,
134 const size_t input_size_threshold,
135 const size_t blocks_no_threshold,
136 command_queue &queue)
137 {
138 const size_t blocks_no = static_cast<size_t>(
139 std::ceil(float(count) / block_size)
140 );
141 // merge with merge path should used only for the large arrays and at the
142 // end of merging part when there are only a few big blocks left to be merged
143 if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
144 Iterator last = first + count;
145 for(size_t i = 0; i < count; i+= 2*block_size)
146 {
147 Iterator first1 = (std::min)(first + i, last);
148 Iterator last1 = (std::min)(first1 + block_size, last);
149 Iterator first2 = last1;
150 Iterator last2 = (std::min)(first2 + block_size, last);
151 Iterator block_result = (std::min)(result + i, result + count);
152 merge_with_merge_path(first1, last1, first2, last2,
153 block_result, compare, queue);
154 }
155 }
156 else {
157 merge_blocks(first, result, compare, count, block_size, false, queue);
158 }
159 }
160
161 template<class KeyIterator, class ValueIterator, class Compare>
162 inline void block_insertion_sort(KeyIterator keys_first,
163 ValueIterator values_first,
164 Compare compare,
165 const size_t count,
166 const size_t block_size,
167 const bool sort_by_key,
168 command_queue &queue)
169 {
170 (void) values_first;
171
172 typedef typename std::iterator_traits<KeyIterator>::value_type K;
173 typedef typename std::iterator_traits<ValueIterator>::value_type T;
174
175 meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
176 size_t count_arg = k.add_arg<uint_>("count");
177 size_t block_size_arg = k.add_arg<uint_>("block_size");
178
179 k <<
180 k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
181 k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
182
183 // block insertion sort (stable)
184 "for(uint i = start+1; i < end; i++){\n" <<
185 " " << k.decl<const K>("key") << " = " <<
186 keys_first[k.var<uint_>("i")] << ";\n";
187 if(sort_by_key){
188 k <<
189 " " << k.decl<const T>("value") << " = " <<
190 values_first[k.var<uint_>("i")] << ";\n";
191 }
192 k <<
193 " uint pos = i;\n" <<
194 " while(pos > start && " <<
195 compare(k.var<const K>("key"),
196 keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
197 " " << keys_first[k.var<uint_>("pos")] << " = " <<
198 keys_first[k.var<uint_>("pos-1")] << ";\n";
199 if(sort_by_key){
200 k <<
201 " " << values_first[k.var<uint_>("pos")] << " = " <<
202 values_first[k.var<uint_>("pos-1")] << ";\n";
203 }
204 k <<
205 " pos--;\n" <<
206 " }\n" <<
207 " " << keys_first[k.var<uint_>("pos")] << " = key;\n";
208 if(sort_by_key) {
209 k <<
210 " " << values_first[k.var<uint_>("pos")] << " = value;\n";
211 }
212 k <<
213 "}\n"; // block insertion sort
214
215 const context &context = queue.get_context();
216 ::boost::compute::kernel kernel = k.compile(context);
217 kernel.set_arg(count_arg, static_cast<uint_>(count));
218 kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
219
220 const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
221 queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
222 }
223
224 template<class Iterator, class Compare>
225 inline void block_insertion_sort(Iterator first,
226 Compare compare,
227 const size_t count,
228 const size_t block_size,
229 command_queue &queue)
230 {
231 // dummy iterator as it's not sort by key
232 Iterator dummy;
233 block_insertion_sort(first, dummy, compare, count, block_size, false, queue);
234 }
235
236 // This sort is stable.
237 template<class Iterator, class Compare>
238 inline void merge_sort_on_cpu(Iterator first,
239 Iterator last,
240 Compare compare,
241 command_queue &queue)
242 {
243 typedef typename std::iterator_traits<Iterator>::value_type value_type;
244
245 size_t count = iterator_range_size(first, last);
246 if(count < 2){
247 return;
248 }
249 // for small input size only insertion sort is performed
250 else if(count <= 512){
251 block_insertion_sort(first, compare, count, count, queue);
252 return;
253 }
254
255 const context &context = queue.get_context();
256 const device &device = queue.get_device();
257
258 // loading parameters
259 std::string cache_key =
260 std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
261 boost::shared_ptr<parameter_cache> parameters =
262 detail::parameter_cache::get_global_cache(device);
263
264 // When there is merge_with_path_blocks_no_threshold or less blocks left to
265 // merge AND input size is merge_with_merge_path_input_size_threshold or more
266 // merge_with_merge_path() algorithm is used to merge sorted blocks;
267 // otherwise merge_blocks() is used.
268 const size_t merge_with_path_blocks_no_threshold =
269 parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
270 const size_t merge_with_path_input_size_threshold =
271 parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
272
273 const size_t block_size =
274 parameters->get(cache_key, "insertion_sort_block_size", 64);
275 block_insertion_sort(first, compare, count, block_size, queue);
276
277 // temporary buffer for merge result
278 vector<value_type> temp(count, context);
279 bool result_in_temporary_buffer = false;
280
281 for(size_t i = block_size; i < count; i *= 2){
282 result_in_temporary_buffer = !result_in_temporary_buffer;
283 if(result_in_temporary_buffer) {
284 dispatch_merge_blocks(first, temp.begin(), compare, count, i,
285 merge_with_path_input_size_threshold,
286 merge_with_path_blocks_no_threshold,
287 queue);
288 } else {
289 dispatch_merge_blocks(temp.begin(), first, compare, count, i,
290 merge_with_path_input_size_threshold,
291 merge_with_path_blocks_no_threshold,
292 queue);
293 }
294 }
295
296 if(result_in_temporary_buffer) {
297 copy(temp.begin(), temp.end(), first, queue);
298 }
299 }
300
301 // This sort is stable.
302 template<class KeyIterator, class ValueIterator, class Compare>
303 inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
304 KeyIterator keys_last,
305 ValueIterator values_first,
306 Compare compare,
307 command_queue &queue)
308 {
309 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
310 typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
311
312 size_t count = iterator_range_size(keys_first, keys_last);
313 if(count < 2){
314 return;
315 }
316 // for small input size only insertion sort is performed
317 else if(count <= 512){
318 block_insertion_sort(keys_first, values_first, compare,
319 count, count, true, queue);
320 return;
321 }
322
323 const context &context = queue.get_context();
324 const device &device = queue.get_device();
325
326 // loading parameters
327 std::string cache_key =
328 std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
329 + "_with_" + type_name<key_type>();
330 boost::shared_ptr<parameter_cache> parameters =
331 detail::parameter_cache::get_global_cache(device);
332
333 const size_t block_size =
334 parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
335 block_insertion_sort(keys_first, values_first, compare,
336 count, block_size, true, queue);
337
338 // temporary buffer for merge results
339 vector<value_type> values_temp(count, context);
340 vector<key_type> keys_temp(count, context);
341 bool result_in_temporary_buffer = false;
342
343 for(size_t i = block_size; i < count; i *= 2){
344 result_in_temporary_buffer = !result_in_temporary_buffer;
345 if(result_in_temporary_buffer) {
346 merge_blocks(keys_first, values_first,
347 keys_temp.begin(), values_temp.begin(),
348 compare, count, i, true, queue);
349 } else {
350 merge_blocks(keys_temp.begin(), values_temp.begin(),
351 keys_first, values_first,
352 compare, count, i, true, queue);
353 }
354 }
355
356 if(result_in_temporary_buffer) {
357 copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
358 copy(values_temp.begin(), values_temp.end(), values_first, queue);
359 }
360 }
361
362 } // end detail namespace
363 } // end compute namespace
364 } // end boost namespace
365
366 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP