ceph/src/boost/libs/compute/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp

   1 //---------------------------------------------------------------------------//
   2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
   3 //
   4 // Distributed under the Boost Software License, Version 1.0
   5 // See accompanying file LICENSE_1_0.txt or copy at
   6 // http://www.boost.org/LICENSE_1_0.txt
   7 //
   8 // See http://boostorg.github.com/compute for more information.
   9 //---------------------------------------------------------------------------//
  10
  11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
  12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
  13
  14 #include <boost/compute/kernel.hpp>
  15 #include <boost/compute/program.hpp>
  16 #include <boost/compute/command_queue.hpp>
  17 #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
  18 #include <boost/compute/container/vector.hpp>
  19 #include <boost/compute/detail/meta_kernel.hpp>
  20 #include <boost/compute/detail/iterator_range_size.hpp>
  21
  22 namespace boost {
  23 namespace compute {
  24 namespace detail {
  25
  26 template<class KeyIterator, class ValueIterator, class Compare>
  27 inline void merge_blocks(KeyIterator keys_first,
  28                          ValueIterator values_first,
  29                          KeyIterator keys_result,
  30                          ValueIterator values_result,
  31                          Compare compare,
  32                          size_t count,
  33                          const size_t block_size,
  34                          const bool sort_by_key,
  35                          command_queue &queue)
  36 {
  37     (void) values_result;
  38     (void) values_first;
  39
  40     meta_kernel k("merge_sort_on_cpu_merge_blocks");
  41     size_t count_arg = k.add_arg<const uint_>("count");
  42     size_t block_size_arg = k.add_arg<uint_>("block_size");
  43
  44     k <<
  45         k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
  46         k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
  47         k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
  48         k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
  49         k.decl<uint_>("result_idx") << " = b1_start;\n" <<
  50
  51         // merging block 1 and block 2 (stable)
  52         "while(b1_start < b1_end && b2_start < b2_end){\n" <<
  53         "    if( " << compare(keys_first[k.var<uint_>("b2_start")],
  54                               keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
  55         "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
  56                       keys_first[k.var<uint_>("b2_start")] << ";\n";
  57     if(sort_by_key){
  58         k <<
  59         "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
  60                       values_first[k.var<uint_>("b2_start")] << ";\n";
  61     }
  62     k <<
  63         "        b2_start++;\n" <<
  64         "    }\n" <<
  65         "    else {\n" <<
  66         "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
  67                       keys_first[k.var<uint_>("b1_start")] << ";\n";
  68     if(sort_by_key){
  69         k <<
  70         "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
  71                       values_first[k.var<uint_>("b1_start")] << ";\n";
  72     }
  73     k <<
  74         "        b1_start++;\n" <<
  75         "    }\n" <<
  76         "    result_idx++;\n" <<
  77         "}\n" <<
  78         "while(b1_start < b1_end){\n" <<
  79         "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
  80                  keys_first[k.var<uint_>("b1_start")] << ";\n";
  81     if(sort_by_key){
  82         k <<
  83         "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
  84                       values_first[k.var<uint_>("b1_start")] << ";\n";
  85     }
  86     k <<
  87         "    b1_start++;\n" <<
  88         "    result_idx++;\n" <<
  89         "}\n" <<
  90         "while(b2_start < b2_end){\n" <<
  91         "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
  92                  keys_first[k.var<uint_>("b2_start")] << ";\n";
  93     if(sort_by_key){
  94         k <<
  95         "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
  96                       values_first[k.var<uint_>("b2_start")] << ";\n";
  97     }
  98     k <<
  99         "    b2_start++;\n" <<
 100         "    result_idx++;\n" <<
 101         "}\n";
 102
 103     const context &context = queue.get_context();
 104     ::boost::compute::kernel kernel = k.compile(context);
 105     kernel.set_arg(count_arg, static_cast<const uint_>(count));
 106     kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
 107
 108     const size_t global_size = static_cast<size_t>(
 109         std::ceil(float(count) / (2 * block_size))
 110     );
 111     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
 112 }
 113
 114 template<class Iterator, class Compare>
 115 inline void merge_blocks(Iterator first,
 116                          Iterator result,
 117                          Compare compare,
 118                          size_t count,
 119                          const size_t block_size,
 120                          const bool sort_by_key,
 121                          command_queue &queue)
 122 {
 123     // dummy iterator as it's not sort by key
 124     Iterator dummy;
 125     merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue);
 126 }
 127
 128 template<class Iterator, class Compare>
 129 inline void dispatch_merge_blocks(Iterator first,
 130                                   Iterator result,
 131                                   Compare compare,
 132                                   size_t count,
 133                                   const size_t block_size,
 134                                   const size_t input_size_threshold,
 135                                   const size_t blocks_no_threshold,
 136                                   command_queue &queue)
 137 {
 138     const size_t blocks_no = static_cast<size_t>(
 139         std::ceil(float(count) / block_size)
 140     );
 141     // merge with merge path should used only for the large arrays and at the
 142     // end of merging part when there are only a few big blocks left to be merged
 143     if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
 144         Iterator last = first + count;
 145         for(size_t i = 0; i < count; i+= 2*block_size)
 146         {
 147             Iterator first1 = (std::min)(first + i, last);
 148             Iterator last1 = (std::min)(first1 + block_size, last);
 149             Iterator first2 = last1;
 150             Iterator last2 = (std::min)(first2 + block_size, last);
 151             Iterator block_result = (std::min)(result + i, result + count);
 152             merge_with_merge_path(first1, last1, first2, last2,
 153                                   block_result, compare, queue);
 154         }
 155     }
 156     else {
 157         merge_blocks(first, result, compare, count, block_size, false, queue);
 158     }
 159 }
 160
 161 template<class KeyIterator, class ValueIterator, class Compare>
 162 inline void block_insertion_sort(KeyIterator keys_first,
 163                                  ValueIterator values_first,
 164                                  Compare compare,
 165                                  const size_t count,
 166                                  const size_t block_size,
 167                                  const bool sort_by_key,
 168                                  command_queue &queue)
 169 {
 170     (void) values_first;
 171
 172     typedef typename std::iterator_traits<KeyIterator>::value_type K;
 173     typedef typename std::iterator_traits<ValueIterator>::value_type T;
 174
 175     meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
 176     size_t count_arg = k.add_arg<uint_>("count");
 177     size_t block_size_arg = k.add_arg<uint_>("block_size");
 178
 179     k <<
 180         k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
 181         k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
 182
 183         // block insertion sort (stable)
 184         "for(uint i = start+1; i < end; i++){\n" <<
 185         "    " << k.decl<const K>("key") << " = " <<
 186                   keys_first[k.var<uint_>("i")] << ";\n";
 187     if(sort_by_key){
 188         k <<
 189         "    " << k.decl<const T>("value") << " = " <<
 190                   values_first[k.var<uint_>("i")] << ";\n";
 191     }
 192     k <<
 193         "    uint pos = i;\n" <<
 194         "    while(pos > start && " <<
 195                    compare(k.var<const K>("key"),
 196                            keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
 197         "        " << keys_first[k.var<uint_>("pos")] << " = " <<
 198                       keys_first[k.var<uint_>("pos-1")] << ";\n";
 199     if(sort_by_key){
 200         k <<
 201         "        " << values_first[k.var<uint_>("pos")] << " = " <<
 202                       values_first[k.var<uint_>("pos-1")] << ";\n";
 203     }
 204     k <<
 205         "        pos--;\n" <<
 206         "    }\n" <<
 207         "    " << keys_first[k.var<uint_>("pos")] << " = key;\n";
 208     if(sort_by_key) {
 209         k <<
 210         "    " << values_first[k.var<uint_>("pos")] << " = value;\n";
 211     }
 212     k <<
 213         "}\n"; // block insertion sort
 214
 215     const context &context = queue.get_context();
 216     ::boost::compute::kernel kernel = k.compile(context);
 217     kernel.set_arg(count_arg, static_cast<uint_>(count));
 218     kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
 219
 220     const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
 221     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
 222 }
 223
 224 template<class Iterator, class Compare>
 225 inline void block_insertion_sort(Iterator first,
 226                                  Compare compare,
 227                                  const size_t count,
 228                                  const size_t block_size,
 229                                  command_queue &queue)
 230 {
 231     // dummy iterator as it's not sort by key
 232     Iterator dummy;
 233     block_insertion_sort(first, dummy, compare, count, block_size, false, queue);
 234 }
 235
 236 // This sort is stable.
 237 template<class Iterator, class Compare>
 238 inline void merge_sort_on_cpu(Iterator first,
 239                               Iterator last,
 240                               Compare compare,
 241                               command_queue &queue)
 242 {
 243     typedef typename std::iterator_traits<Iterator>::value_type value_type;
 244
 245     size_t count = iterator_range_size(first, last);
 246     if(count < 2){
 247         return;
 248     }
 249     // for small input size only insertion sort is performed
 250     else if(count <= 512){
 251         block_insertion_sort(first, compare, count, count, queue);
 252         return;
 253     }
 254
 255     const context &context = queue.get_context();
 256     const device &device = queue.get_device();
 257
 258     // loading parameters
 259     std::string cache_key =
 260         std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
 261     boost::shared_ptr<parameter_cache> parameters =
 262         detail::parameter_cache::get_global_cache(device);
 263
 264     // When there is merge_with_path_blocks_no_threshold or less blocks left to
 265     // merge AND input size is merge_with_merge_path_input_size_threshold or more
 266     // merge_with_merge_path() algorithm is used to merge sorted blocks;
 267     // otherwise merge_blocks() is used.
 268     const size_t merge_with_path_blocks_no_threshold =
 269         parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
 270     const size_t merge_with_path_input_size_threshold =
 271         parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
 272
 273     const size_t block_size =
 274         parameters->get(cache_key, "insertion_sort_block_size", 64);
 275     block_insertion_sort(first, compare, count, block_size, queue);
 276
 277     // temporary buffer for merge result
 278     vector<value_type> temp(count, context);
 279     bool result_in_temporary_buffer = false;
 280
 281     for(size_t i = block_size; i < count; i *= 2){
 282         result_in_temporary_buffer = !result_in_temporary_buffer;
 283         if(result_in_temporary_buffer) {
 284             dispatch_merge_blocks(first, temp.begin(), compare, count, i,
 285                                   merge_with_path_input_size_threshold,
 286                                   merge_with_path_blocks_no_threshold,
 287                                   queue);
 288         } else {
 289             dispatch_merge_blocks(temp.begin(), first, compare, count, i,
 290                                   merge_with_path_input_size_threshold,
 291                                   merge_with_path_blocks_no_threshold,
 292                                   queue);
 293         }
 294     }
 295
 296     if(result_in_temporary_buffer) {
 297         copy(temp.begin(), temp.end(), first, queue);
 298     }
 299 }
 300
 301 // This sort is stable.
 302 template<class KeyIterator, class ValueIterator, class Compare>
 303 inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
 304                                      KeyIterator keys_last,
 305                                      ValueIterator values_first,
 306                                      Compare compare,
 307                                      command_queue &queue)
 308 {
 309     typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
 310     typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
 311
 312     size_t count = iterator_range_size(keys_first, keys_last);
 313     if(count < 2){
 314         return;
 315     }
 316     // for small input size only insertion sort is performed
 317     else if(count <= 512){
 318         block_insertion_sort(keys_first, values_first, compare,
 319                              count, count, true, queue);
 320         return;
 321     }
 322
 323     const context &context = queue.get_context();
 324     const device &device = queue.get_device();
 325
 326     // loading parameters
 327     std::string cache_key =
 328         std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
 329         + "_with_" + type_name<key_type>();
 330     boost::shared_ptr<parameter_cache> parameters =
 331         detail::parameter_cache::get_global_cache(device);
 332
 333     const size_t block_size =
 334         parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
 335     block_insertion_sort(keys_first, values_first, compare,
 336                          count, block_size, true, queue);
 337
 338     // temporary buffer for merge results
 339     vector<value_type> values_temp(count, context);
 340     vector<key_type> keys_temp(count, context);
 341     bool result_in_temporary_buffer = false;
 342
 343     for(size_t i = block_size; i < count; i *= 2){
 344         result_in_temporary_buffer = !result_in_temporary_buffer;
 345         if(result_in_temporary_buffer) {
 346             merge_blocks(keys_first, values_first,
 347                          keys_temp.begin(), values_temp.begin(),
 348                          compare, count, i, true, queue);
 349         } else {
 350             merge_blocks(keys_temp.begin(), values_temp.begin(),
 351                          keys_first, values_first,
 352                          compare, count, i, true, queue);
 353         }
 354     }
 355
 356     if(result_in_temporary_buffer) {
 357         copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
 358         copy(values_temp.begin(), values_temp.end(), values_first, queue);
 359     }
 360 }
 361
 362 } // end detail namespace
 363 } // end compute namespace
 364 } // end boost namespace
 365
 366 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP