// ceph/src/boost/libs/compute/example/matrix_transpose.cpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>

#include <boost/program_options.hpp>

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
22 namespace compute
= boost::compute
;
23 namespace po
= boost::program_options
;
27 const uint_ TILE_DIM
= 32;
28 const uint_ BLOCK_ROWS
= 8;
30 // generate a copy kernel program
31 compute::kernel
make_copy_kernel(const compute::context
& context
)
33 // source for the copy_kernel program
34 const char source
[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
35 __kernel
void copy_kernel(__global
const float *src
, __global
float *dst
)
37 uint x
= get_group_id(0) * TILE_DIM
+ get_local_id(0);
38 uint y
= get_group_id(1) * TILE_DIM
+ get_local_id(1);
40 uint width
= get_num_groups(0) * TILE_DIM
;
42 for(uint i
= 0 ; i
< TILE_DIM
; i
+= BLOCK_ROWS
){
43 dst
[(y
+i
)*width
+x
] = src
[(y
+i
)*width
+ x
];
48 // setup compilation flags for the copy program
49 std::stringstream options
;
50 options
<< "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS
;
52 // create and build the copy program
53 compute::program program
=
54 compute::program::build_with_source(source
, context
, options
.str());
56 // create and return the copy kernel
57 return program
.create_kernel("copy_kernel");
60 // generate a naive transpose kernel
61 compute::kernel
make_naive_transpose_kernel(const compute::context
& context
)
63 // source for the naive_transpose kernel
64 const char source
[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
65 __kernel
void naive_transpose(__global
const float *src
, __global
float *dst
)
67 uint x
= get_group_id(0) * TILE_DIM
+ get_local_id(0);
68 uint y
= get_group_id(1) * TILE_DIM
+ get_local_id(1);
70 uint width
= get_num_groups(0) * TILE_DIM
;
72 for(uint i
= 0 ; i
< TILE_DIM
; i
+= BLOCK_ROWS
){
73 dst
[x
*width
+ y
+i
] = src
[(y
+i
)*width
+ x
];
78 // setup compilation flags for the naive_transpose program
79 std::stringstream options
;
80 options
<< "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS
;
82 // create and build the naive_transpose program
83 compute::program program
=
84 compute::program::build_with_source(source
, context
, options
.str());
86 // create and return the naive_transpose kernel
87 return program
.create_kernel("naive_transpose");
90 // generates a coalesced transpose kernel
91 compute::kernel
make_coalesced_transpose_kernel(const compute::context
& context
)
93 // source for the coalesced_transpose kernel
94 const char source
[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
95 __kernel
void coalesced_transpose(__global
const float *src
, __global
float *dst
)
97 __local
float tile
[TILE_DIM
][TILE_DIM
];
100 uint x
= get_group_id(0) * TILE_DIM
+ get_local_id(0);
101 uint y
= get_group_id(1) * TILE_DIM
+ get_local_id(1);
103 uint width
= get_num_groups(0) * TILE_DIM
;
105 // load inside local memory
106 for(uint i
= 0 ; i
< TILE_DIM
; i
+= BLOCK_ROWS
){
107 tile
[get_local_id(1)+i
][get_local_id(0)] = src
[(y
+i
)*width
+ x
];
110 barrier(CLK_LOCAL_MEM_FENCE
);
113 x
= get_group_id(1) * TILE_DIM
+ get_local_id(0);
114 y
= get_group_id(0) * TILE_DIM
+ get_local_id(1);
116 // write output from local memory
117 for(uint i
= 0 ; i
< TILE_DIM
; i
+=BLOCK_ROWS
){
118 dst
[(y
+i
)*width
+ x
] = tile
[get_local_id(0)][get_local_id(1)+i
];
123 // setup compilation flags for the coalesced_transpose program
124 std::stringstream options
;
125 options
<< "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS
;
127 // create and build the coalesced_transpose program
128 compute::program program
=
129 compute::program::build_with_source(source
, context
, options
.str());
131 // create and return coalesced_transpose kernel
132 return program
.create_kernel("coalesced_transpose");
135 // generate a coalesced withtout bank conflicts kernel
136 compute::kernel
make_coalesced_no_bank_conflicts_kernel(const compute::context
& context
)
138 const char source
[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
139 __kernel
void coalesced_no_bank_conflicts(__global
const float *src
, __global
float *dst
)
141 // TILE_DIM+1 is here to avoid bank conflicts in local memory
142 __local
float tile
[TILE_DIM
][TILE_DIM
+1];
145 uint x
= get_group_id(0) * TILE_DIM
+ get_local_id(0);
146 uint y
= get_group_id(1) * TILE_DIM
+ get_local_id(1);
148 uint width
= get_num_groups(0) * TILE_DIM
;
150 // load inside local memory
151 for(uint i
= 0 ; i
< TILE_DIM
; i
+= BLOCK_ROWS
){
152 tile
[get_local_id(1)+i
][get_local_id(0)] = src
[(y
+i
)*width
+ x
];
155 barrier(CLK_LOCAL_MEM_FENCE
);
158 x
= get_group_id(1) * TILE_DIM
+ get_local_id(0);
159 y
= get_group_id(0) * TILE_DIM
+ get_local_id(1);
161 // write output from local memory
162 for(uint i
= 0 ; i
< TILE_DIM
; i
+=BLOCK_ROWS
){
163 dst
[(y
+i
)*width
+ x
] = tile
[get_local_id(0)][get_local_id(1)+i
];
168 // setup compilation flags for the coalesced_no_bank_conflicts program
169 std::stringstream options
;
170 options
<< "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS
;
172 // create and build the coalesced_no_bank_conflicts program
173 compute::program program
=
174 compute::program::build_with_source(source
, context
, options
.str());
176 // create and return the coalesced_no_bank_conflicts kernel
177 return program
.create_kernel("coalesced_no_bank_conflicts");
180 // compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
181 bool check_transposition(const std::vector
<float>& expectedResult
,
183 const std::vector
<float>& transposedMatrix
)
185 for(uint_ i
= 0 ; i
< size
; ++i
){
186 if(expectedResult
[i
] != transposedMatrix
[i
]){
187 std::cout
<< "idx = " << i
<< " , expected " << expectedResult
[i
]
188 << " , got " << transposedMatrix
[i
] << std::endl
;
189 std::cout
<< "FAILED" << std::endl
;
196 // generate a matrix inside 'in' and do the tranposition inside 'out'
197 void generate_matrix(std::vector
<float>& in
, std::vector
<float>& out
, uint_ rows
, uint_ cols
)
200 for(uint_ i
= 0 ; i
< rows
; ++i
){
201 for(uint_ j
= 0 ; j
< cols
; ++j
){
202 in
[i
*cols
+ j
] = i
*cols
+ j
;
206 // store transposed result
207 for(uint_ j
= 0; j
< cols
; ++j
){
208 for(uint_ i
= 0 ; i
< rows
; ++i
){
209 out
[j
*rows
+ i
] = in
[i
*cols
+ j
];
214 // neccessary for 64-bit integer on win32
216 #define uint64_t unsigned __int64
219 int main(int argc
, char *argv
[])
221 // setup command line arguments
222 po::options_description
options("options");
223 options
.add_options()
224 ("help", "show usage instructions")
225 ("rows", po::value
<uint_
>()->default_value(4096), "number of matrix rows")
226 ("cols", po::value
<uint_
>()->default_value(4096), "number of matrix columns")
229 // parse command line
230 po::variables_map vm
;
231 po::store(po::parse_command_line(argc
, argv
, options
), vm
);
234 // check command line arguments
235 if(vm
.count("help")){
236 std::cout
<< options
<< std::endl
;
240 // get number rows and columns for the matrix
241 const uint_ rows
= vm
["rows"].as
<uint_
>();
242 const uint_ cols
= vm
["cols"].as
<uint_
>();
244 // get the default device
245 compute::device device
= compute::system::default_device();
247 // print out device name and matrix information
248 std::cout
<< "Device: " << device
.name() << std::endl
;
249 std::cout
<< "Matrix Size: " << rows
<< "x" << cols
<< std::endl
;
250 std::cout
<< "Grid Size: " << rows
/TILE_DIM
<< "x" << cols
/TILE_DIM
<< " blocks" << std::endl
;
251 std::cout
<< "Local Size: " << TILE_DIM
<< "x" << BLOCK_ROWS
<< " threads" << std::endl
;
252 std::cout
<< std::endl
;
254 // On OSX this example does not work on CPU devices
255 #if defined(__APPLE__)
256 if(device
.type() & compute::device::cpu
) {
257 std::cout
<< "On OSX this example does not work on CPU devices" << std::endl
;
262 const size_t global_work_size
[2] = {rows
, cols
*BLOCK_ROWS
/TILE_DIM
};
263 const size_t local_work_size
[2] = {TILE_DIM
, BLOCK_ROWS
};
265 // setup input data on the host
266 const uint_ size
= rows
* cols
;
267 std::vector
<float> h_input(size
);
268 std::vector
<float> h_output(size
);
269 std::vector
<float> expectedResult(size
);
270 generate_matrix(h_input
, expectedResult
, rows
, cols
);
272 // create a context for the device
273 compute::context
context(device
);
276 compute::vector
<float> d_input(size
, context
);
277 compute::vector
<float> d_output(size
, context
);
279 // command_queue with profiling
280 compute::command_queue
queue(context
, device
, compute::command_queue::enable_profiling
);
283 compute::copy(h_input
.begin(), h_input
.end(), d_input
.begin(), queue
);
285 // simple copy kernel
286 std::cout
<< "Testing copy_kernel:" << std::endl
;
287 compute::kernel kernel
= make_copy_kernel(context
);
288 kernel
.set_arg(0, d_input
);
289 kernel
.set_arg(1, d_output
);
291 compute::event start
;
292 start
= queue
.enqueue_nd_range_kernel(kernel
, 2, 0, global_work_size
, local_work_size
);
294 uint64_t elapsed
= start
.duration
<boost::chrono::nanoseconds
>().count();
296 std::cout
<< " Elapsed: " << elapsed
<< " ns" << std::endl
;
297 std::cout
<< " BandWidth: " << 2*rows
*cols
*sizeof(float) / elapsed
<< " GB/s" << std::endl
;
298 compute::copy(d_output
.begin(), d_output
.end(), h_output
.begin(), queue
);
300 check_transposition(h_input
, rows
*cols
, h_output
);
301 std::cout
<< std::endl
;
303 // naive_transpose kernel
304 std::cout
<< "Testing naive_transpose:" << std::endl
;
305 kernel
= make_naive_transpose_kernel(context
);
306 kernel
.set_arg(0, d_input
);
307 kernel
.set_arg(1, d_output
);
309 start
= queue
.enqueue_nd_range_kernel(kernel
, 2, 0, global_work_size
, local_work_size
);
311 elapsed
= start
.duration
<boost::chrono::nanoseconds
>().count();
312 std::cout
<< " Elapsed: " << elapsed
<< " ns" << std::endl
;
313 std::cout
<< " BandWidth: " << 2*rows
*cols
*sizeof(float) / elapsed
<< " GB/s" << std::endl
;
314 compute::copy(d_output
.begin(), d_output
.end(), h_output
.begin(), queue
);
316 check_transposition(expectedResult
, rows
*cols
, h_output
);
317 std::cout
<< std::endl
;
319 // coalesced_transpose kernel
320 std::cout
<< "Testing coalesced_transpose:" << std::endl
;
321 kernel
= make_coalesced_transpose_kernel(context
);
322 kernel
.set_arg(0, d_input
);
323 kernel
.set_arg(1, d_output
);
325 start
= queue
.enqueue_nd_range_kernel(kernel
, 2, 0, global_work_size
, local_work_size
);
327 elapsed
= start
.duration
<boost::chrono::nanoseconds
>().count();
328 std::cout
<< " Elapsed: " << elapsed
<< " ns" << std::endl
;
329 std::cout
<< " BandWidth: " << 2*rows
*cols
*sizeof(float) / elapsed
<< " GB/s" << std::endl
;
331 compute::copy(d_output
.begin(), d_output
.end(), h_output
.begin(), queue
);
333 check_transposition(expectedResult
, rows
*cols
, h_output
);
334 std::cout
<< std::endl
;
336 // coalesced_no_bank_conflicts kernel
337 std::cout
<< "Testing coalesced_no_bank_conflicts:" << std::endl
;
339 kernel
= make_coalesced_no_bank_conflicts_kernel(context
);
340 kernel
.set_arg(0, d_input
);
341 kernel
.set_arg(1, d_output
);
343 start
= queue
.enqueue_nd_range_kernel(kernel
, 2, 0, global_work_size
, local_work_size
);
345 elapsed
= start
.duration
<boost::chrono::nanoseconds
>().count();
346 std::cout
<< " Elapsed: " << elapsed
<< " ns" << std::endl
;
347 std::cout
<< " BandWidth: " << 2*rows
*cols
*sizeof(float) / elapsed
<< " GB/s" << std::endl
;
349 compute::copy(d_output
.begin(), d_output
.end(), h_output
.begin(), queue
);
351 check_transposition(expectedResult
, rows
*cols
, h_output
);
352 std::cout
<< std::endl
;