]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/example/matrix_transpose.cpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / compute / example / matrix_transpose.cpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
#include <cstdlib>
#include <iostream>
#include <sstream>

#include <boost/program_options.hpp>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
21
namespace compute = boost::compute;
namespace po = boost::program_options;

using compute::uint_;

// Edge length of the square tile processed by one work-group; also the
// value substituted for TILE_DIM inside every kernel via -D build options.
const uint_ TILE_DIM = 32;
// Second dimension of the work-group (TILE_DIM x BLOCK_ROWS threads); each
// thread therefore handles TILE_DIM / BLOCK_ROWS rows of its tile.
const uint_ BLOCK_ROWS = 8;
29
30 // generate a copy kernel program
31 compute::kernel make_copy_kernel(const compute::context& context)
32 {
33 // source for the copy_kernel program
34 const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
35 __kernel void copy_kernel(__global const float *src, __global float *dst)
36 {
37 uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
38 uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
39
40 uint width = get_num_groups(0) * TILE_DIM;
41
42 for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
43 dst[(y+i)*width +x] = src[(y+i)*width + x];
44 }
45 }
46 );
47
48 // setup compilation flags for the copy program
49 std::stringstream options;
50 options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
51
52 // create and build the copy program
53 compute::program program =
54 compute::program::build_with_source(source, context, options.str());
55
56 // create and return the copy kernel
57 return program.create_kernel("copy_kernel");
58 }
59
60 // generate a naive transpose kernel
61 compute::kernel make_naive_transpose_kernel(const compute::context& context)
62 {
63 // source for the naive_transpose kernel
64 const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
65 __kernel void naive_transpose(__global const float *src, __global float *dst)
66 {
67 uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
68 uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
69
70 uint width = get_num_groups(0) * TILE_DIM;
71
72 for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
73 dst[x*width + y+i] = src[(y+i)*width + x];
74 }
75 }
76 );
77
78 // setup compilation flags for the naive_transpose program
79 std::stringstream options;
80 options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
81
82 // create and build the naive_transpose program
83 compute::program program =
84 compute::program::build_with_source(source, context, options.str());
85
86 // create and return the naive_transpose kernel
87 return program.create_kernel("naive_transpose");
88 }
89
// generates a coalesced transpose kernel
//
// The kernel stages one TILE_DIM x TILE_DIM tile in __local memory,
// synchronizes the work-group, then writes the tile out transposed. Both
// the global read and the global write walk contiguous addresses; the
// transposition itself happens through the local tile.
compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
{
    // source for the coalesced_transpose kernel
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalesced_transpose(__global const float *src, __global float *dst)
        {
            // one tile staged in on-chip local memory per work-group
            __local float tile[TILE_DIM][TILE_DIM];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            // (each thread loads TILE_DIM / BLOCK_ROWS elements of the tile)
            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            // make the whole tile visible to every thread of the group
            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes: swap the roles of the two group ids so the
            // write targets the mirrored tile of the output matrix
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory, reading the tile transposed
            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // setup compilation flags for the coalesced_transpose program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;

    // create and build the coalesced_transpose program
    compute::program program =
        compute::program::build_with_source(source, context, options.str());

    // create and return coalesced_transpose kernel
    return program.create_kernel("coalesced_transpose");
}
134
// generate a coalesced without bank conflicts kernel
//
// Identical to the coalesced_transpose kernel except that the local tile
// has one padding column (TILE_DIM+1), which offsets each row so column
// accesses during the transposed read no longer hit the same local-memory
// bank.
compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
        {
            // TILE_DIM+1 is here to avoid bank conflicts in local memory
            __local float tile[TILE_DIM][TILE_DIM+1];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            // (each thread loads TILE_DIM / BLOCK_ROWS elements of the tile)
            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            // make the whole tile visible to every thread of the group
            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes: swap the roles of the two group ids so the
            // write targets the mirrored tile of the output matrix
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory, reading the tile transposed
            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // setup compilation flags for the coalesced_no_bank_conflicts program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;

    // create and build the coalesced_no_bank_conflicts program
    compute::program program =
        compute::program::build_with_source(source, context, options.str());

    // create and return the coalesced_no_bank_conflicts kernel
    return program.create_kernel("coalesced_no_bank_conflicts");
}
179
180 // compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
181 bool check_transposition(const std::vector<float>& expectedResult,
182 uint_ size,
183 const std::vector<float>& transposedMatrix)
184 {
185 for(uint_ i = 0 ; i < size ; ++i){
186 if(expectedResult[i] != transposedMatrix[i]){
187 std::cout << "idx = " << i << " , expected " << expectedResult[i]
188 << " , got " << transposedMatrix[i] << std::endl;
189 std::cout << "FAILED" << std::endl;
190 return false;
191 }
192 }
193 return true;
194 }
195
196 // generate a matrix inside 'in' and do the tranposition inside 'out'
197 void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
198 {
199 // generate a matrix
200 for(uint_ i = 0 ; i < rows ; ++i){
201 for(uint_ j = 0 ; j < cols ; ++j){
202 in[i*cols + j] = i*cols + j;
203 }
204 }
205
206 // store transposed result
207 for(uint_ j = 0; j < cols ; ++j){
208 for(uint_ i = 0 ; i < rows ; ++i){
209 out[j*rows + i] = in[i*cols + j];
210 }
211 }
212 }
213
// necessary for 64-bit integer on win32
// NOTE(review): #define-ing the standard name uint64_t is fragile -- with
// C++11's <cstdint> this macro shadows the library typedef; consider
// '#include <cstdint>' instead. Kept as-is for compatibility.
#ifdef _WIN32
#define uint64_t unsigned __int64
#endif
218
// Benchmark driver: builds the four kernels, times each with an OpenCL
// profiling event, reports elapsed time and effective bandwidth, and
// verifies each result against a host-computed reference.
int main(int argc, char *argv[])
{
    // setup command line arguments
    po::options_description options("options");
    options.add_options()
        ("help", "show usage instructions")
        ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
        ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
    ;

    // parse command line
    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, options), vm);
    po::notify(vm);

    // check command line arguments
    if(vm.count("help")){
        std::cout << options << std::endl;
        return 0;
    }

    // get number rows and columns for the matrix
    // NOTE(review): the work sizes below assume both dimensions are
    // multiples of TILE_DIM (32); other values mis-size the grid -- confirm
    const uint_ rows = vm["rows"].as<uint_>();
    const uint_ cols = vm["cols"].as<uint_>();

    // get the default device
    compute::device device = compute::system::default_device();

    // print out device name and matrix information
    std::cout << "Device: " << device.name() << std::endl;
    std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
    std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
    std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
    std::cout << std::endl;

    // On OSX this example does not work on CPU devices
#if defined(__APPLE__)
    if(device.type() & compute::device::cpu) {
        std::cout << "On OSX this example does not work on CPU devices" << std::endl;
        return 0;
    }
#endif

    // each work-item covers TILE_DIM / BLOCK_ROWS rows of its tile, hence
    // the scaled second dimension of the global size
    const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
    const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};

    // setup input data on the host
    const uint_ size = rows * cols;
    std::vector<float> h_input(size);
    std::vector<float> h_output(size);
    std::vector<float> expectedResult(size);
    generate_matrix(h_input, expectedResult, rows, cols);

    // create a context for the device
    compute::context context(device);

    // device vectors
    compute::vector<float> d_input(size, context);
    compute::vector<float> d_output(size, context);

    // command_queue with profiling (needed to read event durations below)
    compute::command_queue queue(context, device, compute::command_queue::enable_profiling);

    // copy input data
    compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);

    // simple copy kernel
    std::cout << "Testing copy_kernel:" << std::endl;
    compute::kernel kernel = make_copy_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    compute::event start;
    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    // elapsed device time in nanoseconds from the profiling event
    // NOTE(review): a zero 'elapsed' would make the bandwidth division
    // below undefined -- unlikely at these sizes, but unchecked
    uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();

    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    // one read + one write of the whole matrix; bytes/ns ~ GB/s
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    // the copy kernel must reproduce the input verbatim
    check_transposition(h_input, rows*cols, h_output);
    std::cout << std::endl;

    // naive_transpose kernel
    std::cout << "Testing naive_transpose:" << std::endl;
    kernel = make_naive_transpose_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    // coalesced_transpose kernel
    std::cout << "Testing coalesced_transpose:" << std::endl;
    kernel = make_coalesced_transpose_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;

    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    // coalesced_no_bank_conflicts kernel
    std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;

    kernel = make_coalesced_no_bank_conflicts_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;

    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    return 0;
}