]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | //---------------------------------------------------------------------------// |
2 | // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP | |
13 | ||
14 | #include <boost/compute/kernel.hpp> | |
15 | #include <boost/compute/program.hpp> | |
16 | #include <boost/compute/command_queue.hpp> | |
17 | #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp> | |
18 | #include <boost/compute/container/vector.hpp> | |
19 | #include <boost/compute/detail/meta_kernel.hpp> | |
20 | #include <boost/compute/detail/iterator_range_size.hpp> | |
21 | ||
22 | namespace boost { | |
23 | namespace compute { | |
24 | namespace detail { | |
25 | ||
26 | template<class KeyIterator, class ValueIterator, class Compare> | |
27 | inline void merge_blocks(KeyIterator keys_first, | |
28 | ValueIterator values_first, | |
29 | KeyIterator keys_result, | |
30 | ValueIterator values_result, | |
31 | Compare compare, | |
32 | size_t count, | |
33 | const size_t block_size, | |
34 | const bool sort_by_key, | |
35 | command_queue &queue) | |
36 | { | |
37 | (void) values_result; | |
38 | (void) values_first; | |
39 | ||
40 | meta_kernel k("merge_sort_on_cpu_merge_blocks"); | |
41 | size_t count_arg = k.add_arg<const uint_>("count"); | |
42 | size_t block_size_arg = k.add_arg<uint_>("block_size"); | |
43 | ||
44 | k << | |
45 | k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" << | |
46 | k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" << | |
47 | k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" << | |
48 | k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" << | |
49 | k.decl<uint_>("result_idx") << " = b1_start;\n" << | |
50 | ||
51 | // merging block 1 and block 2 (stable) | |
52 | "while(b1_start < b1_end && b2_start < b2_end){\n" << | |
53 | " if( " << compare(keys_first[k.var<uint_>("b2_start")], | |
54 | keys_first[k.var<uint_>("b1_start")]) << "){\n" << | |
55 | " " << keys_result[k.var<uint_>("result_idx")] << " = " << | |
56 | keys_first[k.var<uint_>("b2_start")] << ";\n"; | |
57 | if(sort_by_key){ | |
58 | k << | |
59 | " " << values_result[k.var<uint_>("result_idx")] << " = " << | |
60 | values_first[k.var<uint_>("b2_start")] << ";\n"; | |
61 | } | |
62 | k << | |
63 | " b2_start++;\n" << | |
64 | " }\n" << | |
65 | " else {\n" << | |
66 | " " << keys_result[k.var<uint_>("result_idx")] << " = " << | |
67 | keys_first[k.var<uint_>("b1_start")] << ";\n"; | |
68 | if(sort_by_key){ | |
69 | k << | |
70 | " " << values_result[k.var<uint_>("result_idx")] << " = " << | |
71 | values_first[k.var<uint_>("b1_start")] << ";\n"; | |
72 | } | |
73 | k << | |
74 | " b1_start++;\n" << | |
75 | " }\n" << | |
76 | " result_idx++;\n" << | |
77 | "}\n" << | |
78 | "while(b1_start < b1_end){\n" << | |
79 | " " << keys_result[k.var<uint_>("result_idx")] << " = " << | |
80 | keys_first[k.var<uint_>("b1_start")] << ";\n"; | |
81 | if(sort_by_key){ | |
82 | k << | |
83 | " " << values_result[k.var<uint_>("result_idx")] << " = " << | |
84 | values_first[k.var<uint_>("b1_start")] << ";\n"; | |
85 | } | |
86 | k << | |
87 | " b1_start++;\n" << | |
88 | " result_idx++;\n" << | |
89 | "}\n" << | |
90 | "while(b2_start < b2_end){\n" << | |
91 | " " << keys_result[k.var<uint_>("result_idx")] << " = " << | |
92 | keys_first[k.var<uint_>("b2_start")] << ";\n"; | |
93 | if(sort_by_key){ | |
94 | k << | |
95 | " " << values_result[k.var<uint_>("result_idx")] << " = " << | |
96 | values_first[k.var<uint_>("b2_start")] << ";\n"; | |
97 | } | |
98 | k << | |
99 | " b2_start++;\n" << | |
100 | " result_idx++;\n" << | |
101 | "}\n"; | |
102 | ||
103 | const context &context = queue.get_context(); | |
104 | ::boost::compute::kernel kernel = k.compile(context); | |
105 | kernel.set_arg(count_arg, static_cast<const uint_>(count)); | |
106 | kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); | |
107 | ||
108 | const size_t global_size = static_cast<size_t>( | |
109 | std::ceil(float(count) / (2 * block_size)) | |
110 | ); | |
111 | queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); | |
112 | } | |
113 | ||
114 | template<class Iterator, class Compare> | |
115 | inline void merge_blocks(Iterator first, | |
116 | Iterator result, | |
117 | Compare compare, | |
118 | size_t count, | |
119 | const size_t block_size, | |
120 | const bool sort_by_key, | |
121 | command_queue &queue) | |
122 | { | |
123 | // dummy iterator as it's not sort by key | |
124 | Iterator dummy; | |
125 | merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue); | |
126 | } | |
127 | ||
128 | template<class Iterator, class Compare> | |
129 | inline void dispatch_merge_blocks(Iterator first, | |
130 | Iterator result, | |
131 | Compare compare, | |
132 | size_t count, | |
133 | const size_t block_size, | |
134 | const size_t input_size_threshold, | |
135 | const size_t blocks_no_threshold, | |
136 | command_queue &queue) | |
137 | { | |
138 | const size_t blocks_no = static_cast<size_t>( | |
139 | std::ceil(float(count) / block_size) | |
140 | ); | |
141 | // merge with merge path should used only for the large arrays and at the | |
142 | // end of merging part when there are only a few big blocks left to be merged | |
143 | if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ | |
144 | Iterator last = first + count; | |
145 | for(size_t i = 0; i < count; i+= 2*block_size) | |
146 | { | |
147 | Iterator first1 = (std::min)(first + i, last); | |
148 | Iterator last1 = (std::min)(first1 + block_size, last); | |
149 | Iterator first2 = last1; | |
150 | Iterator last2 = (std::min)(first2 + block_size, last); | |
151 | Iterator block_result = (std::min)(result + i, result + count); | |
152 | merge_with_merge_path(first1, last1, first2, last2, | |
153 | block_result, compare, queue); | |
154 | } | |
155 | } | |
156 | else { | |
157 | merge_blocks(first, result, compare, count, block_size, false, queue); | |
158 | } | |
159 | } | |
160 | ||
161 | template<class KeyIterator, class ValueIterator, class Compare> | |
162 | inline void block_insertion_sort(KeyIterator keys_first, | |
163 | ValueIterator values_first, | |
164 | Compare compare, | |
165 | const size_t count, | |
166 | const size_t block_size, | |
167 | const bool sort_by_key, | |
168 | command_queue &queue) | |
169 | { | |
170 | (void) values_first; | |
171 | ||
172 | typedef typename std::iterator_traits<KeyIterator>::value_type K; | |
173 | typedef typename std::iterator_traits<ValueIterator>::value_type T; | |
174 | ||
175 | meta_kernel k("merge_sort_on_cpu_block_insertion_sort"); | |
176 | size_t count_arg = k.add_arg<uint_>("count"); | |
177 | size_t block_size_arg = k.add_arg<uint_>("block_size"); | |
178 | ||
179 | k << | |
180 | k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" << | |
181 | k.decl<uint_>("end") << " = min(count, start + block_size);\n" << | |
182 | ||
183 | // block insertion sort (stable) | |
184 | "for(uint i = start+1; i < end; i++){\n" << | |
185 | " " << k.decl<const K>("key") << " = " << | |
186 | keys_first[k.var<uint_>("i")] << ";\n"; | |
187 | if(sort_by_key){ | |
188 | k << | |
189 | " " << k.decl<const T>("value") << " = " << | |
190 | values_first[k.var<uint_>("i")] << ";\n"; | |
191 | } | |
192 | k << | |
193 | " uint pos = i;\n" << | |
194 | " while(pos > start && " << | |
195 | compare(k.var<const K>("key"), | |
196 | keys_first[k.var<uint_>("pos-1")]) << "){\n" << | |
197 | " " << keys_first[k.var<uint_>("pos")] << " = " << | |
198 | keys_first[k.var<uint_>("pos-1")] << ";\n"; | |
199 | if(sort_by_key){ | |
200 | k << | |
201 | " " << values_first[k.var<uint_>("pos")] << " = " << | |
202 | values_first[k.var<uint_>("pos-1")] << ";\n"; | |
203 | } | |
204 | k << | |
205 | " pos--;\n" << | |
206 | " }\n" << | |
207 | " " << keys_first[k.var<uint_>("pos")] << " = key;\n"; | |
208 | if(sort_by_key) { | |
209 | k << | |
210 | " " << values_first[k.var<uint_>("pos")] << " = value;\n"; | |
211 | } | |
212 | k << | |
213 | "}\n"; // block insertion sort | |
214 | ||
215 | const context &context = queue.get_context(); | |
216 | ::boost::compute::kernel kernel = k.compile(context); | |
217 | kernel.set_arg(count_arg, static_cast<uint_>(count)); | |
218 | kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); | |
219 | ||
220 | const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size)); | |
221 | queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); | |
222 | } | |
223 | ||
224 | template<class Iterator, class Compare> | |
225 | inline void block_insertion_sort(Iterator first, | |
226 | Compare compare, | |
227 | const size_t count, | |
228 | const size_t block_size, | |
229 | command_queue &queue) | |
230 | { | |
231 | // dummy iterator as it's not sort by key | |
232 | Iterator dummy; | |
233 | block_insertion_sort(first, dummy, compare, count, block_size, false, queue); | |
234 | } | |
235 | ||
236 | // This sort is stable. | |
237 | template<class Iterator, class Compare> | |
238 | inline void merge_sort_on_cpu(Iterator first, | |
239 | Iterator last, | |
240 | Compare compare, | |
241 | command_queue &queue) | |
242 | { | |
243 | typedef typename std::iterator_traits<Iterator>::value_type value_type; | |
244 | ||
245 | size_t count = iterator_range_size(first, last); | |
246 | if(count < 2){ | |
247 | return; | |
248 | } | |
249 | // for small input size only insertion sort is performed | |
250 | else if(count <= 512){ | |
251 | block_insertion_sort(first, compare, count, count, queue); | |
252 | return; | |
253 | } | |
254 | ||
255 | const context &context = queue.get_context(); | |
256 | const device &device = queue.get_device(); | |
257 | ||
258 | // loading parameters | |
259 | std::string cache_key = | |
260 | std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>(); | |
261 | boost::shared_ptr<parameter_cache> parameters = | |
262 | detail::parameter_cache::get_global_cache(device); | |
263 | ||
264 | // When there is merge_with_path_blocks_no_threshold or less blocks left to | |
265 | // merge AND input size is merge_with_merge_path_input_size_threshold or more | |
266 | // merge_with_merge_path() algorithm is used to merge sorted blocks; | |
267 | // otherwise merge_blocks() is used. | |
268 | const size_t merge_with_path_blocks_no_threshold = | |
269 | parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); | |
270 | const size_t merge_with_path_input_size_threshold = | |
271 | parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); | |
272 | ||
273 | const size_t block_size = | |
274 | parameters->get(cache_key, "insertion_sort_block_size", 64); | |
275 | block_insertion_sort(first, compare, count, block_size, queue); | |
276 | ||
277 | // temporary buffer for merge result | |
278 | vector<value_type> temp(count, context); | |
279 | bool result_in_temporary_buffer = false; | |
280 | ||
281 | for(size_t i = block_size; i < count; i *= 2){ | |
282 | result_in_temporary_buffer = !result_in_temporary_buffer; | |
283 | if(result_in_temporary_buffer) { | |
284 | dispatch_merge_blocks(first, temp.begin(), compare, count, i, | |
285 | merge_with_path_input_size_threshold, | |
286 | merge_with_path_blocks_no_threshold, | |
287 | queue); | |
288 | } else { | |
289 | dispatch_merge_blocks(temp.begin(), first, compare, count, i, | |
290 | merge_with_path_input_size_threshold, | |
291 | merge_with_path_blocks_no_threshold, | |
292 | queue); | |
293 | } | |
294 | } | |
295 | ||
296 | if(result_in_temporary_buffer) { | |
297 | copy(temp.begin(), temp.end(), first, queue); | |
298 | } | |
299 | } | |
300 | ||
301 | // This sort is stable. | |
302 | template<class KeyIterator, class ValueIterator, class Compare> | |
303 | inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, | |
304 | KeyIterator keys_last, | |
305 | ValueIterator values_first, | |
306 | Compare compare, | |
307 | command_queue &queue) | |
308 | { | |
309 | typedef typename std::iterator_traits<KeyIterator>::value_type key_type; | |
310 | typedef typename std::iterator_traits<ValueIterator>::value_type value_type; | |
311 | ||
312 | size_t count = iterator_range_size(keys_first, keys_last); | |
313 | if(count < 2){ | |
314 | return; | |
315 | } | |
316 | // for small input size only insertion sort is performed | |
317 | else if(count <= 512){ | |
318 | block_insertion_sort(keys_first, values_first, compare, | |
319 | count, count, true, queue); | |
320 | return; | |
321 | } | |
322 | ||
323 | const context &context = queue.get_context(); | |
324 | const device &device = queue.get_device(); | |
325 | ||
326 | // loading parameters | |
327 | std::string cache_key = | |
328 | std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() | |
329 | + "_with_" + type_name<key_type>(); | |
330 | boost::shared_ptr<parameter_cache> parameters = | |
331 | detail::parameter_cache::get_global_cache(device); | |
332 | ||
333 | const size_t block_size = | |
334 | parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); | |
335 | block_insertion_sort(keys_first, values_first, compare, | |
336 | count, block_size, true, queue); | |
337 | ||
338 | // temporary buffer for merge results | |
339 | vector<value_type> values_temp(count, context); | |
340 | vector<key_type> keys_temp(count, context); | |
341 | bool result_in_temporary_buffer = false; | |
342 | ||
343 | for(size_t i = block_size; i < count; i *= 2){ | |
344 | result_in_temporary_buffer = !result_in_temporary_buffer; | |
345 | if(result_in_temporary_buffer) { | |
346 | merge_blocks(keys_first, values_first, | |
347 | keys_temp.begin(), values_temp.begin(), | |
348 | compare, count, i, true, queue); | |
349 | } else { | |
350 | merge_blocks(keys_temp.begin(), values_temp.begin(), | |
351 | keys_first, values_first, | |
352 | compare, count, i, true, queue); | |
353 | } | |
354 | } | |
355 | ||
356 | if(result_in_temporary_buffer) { | |
357 | copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); | |
358 | copy(values_temp.begin(), values_temp.end(), values_first, queue); | |
359 | } | |
360 | } | |
361 | ||
362 | } // end detail namespace | |
363 | } // end compute namespace | |
364 | } // end boost namespace | |
365 | ||
366 | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP |