]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/example/matrix_transpose.cpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / compute / example / matrix_transpose.cpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
#include <cstdlib>
#include <iostream>
#include <sstream>

#include <boost/program_options.hpp>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
21
namespace compute = boost::compute;
namespace po = boost::program_options;

using compute::uint_;

// Edge length of the square tile processed by one work-group; also the
// value substituted for TILE_DIM inside every kernel via -D build options.
const uint_ TILE_DIM = 32;
// Second dimension of the work-group (TILE_DIM x BLOCK_ROWS threads); each
// thread therefore handles TILE_DIM / BLOCK_ROWS rows of its tile.
const uint_ BLOCK_ROWS = 8;
29
30 // generate a copy kernel program
31 compute::kernel make_copy_kernel(const compute::context& context)
32 {
33 // source for the copy_kernel program
34 const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
35 __kernel void copy_kernel(__global const float *src, __global float *dst)
36 {
37 uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
38 uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
39
40 uint width = get_num_groups(0) * TILE_DIM;
41
42 for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
43 dst[(y+i)*width +x] = src[(y+i)*width + x];
44 }
45 }
46 );
47
48 // setup compilation flags for the copy program
49 std::stringstream options;
50 options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
51
52 // create and build the copy program
53 compute::program program =
54 compute::program::build_with_source(source, context, options.str());
55
56 // create and return the copy kernel
57 return program.create_kernel("copy_kernel");
58 }
59
60 // generate a naive transpose kernel
61 compute::kernel make_naive_transpose_kernel(const compute::context& context)
62 {
63 // source for the naive_transpose kernel
64 const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
65 __kernel void naive_transpose(__global const float *src, __global float *dst)
66 {
67 uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
68 uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
69
70 uint width = get_num_groups(0) * TILE_DIM;
71
72 for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
73 dst[x*width + y+i] = src[(y+i)*width + x];
74 }
75 }
76 );
77
78 // setup compilation flags for the naive_transpose program
79 std::stringstream options;
80 options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
81
82 // create and build the naive_transpose program
83 compute::program program =
84 compute::program::build_with_source(source, context, options.str());
85
86 // create and return the naive_transpose kernel
87 return program.create_kernel("naive_transpose");
88 }
89
// generates a coalesced transpose kernel
//
// The kernel stages one TILE_DIM x TILE_DIM tile in __local memory,
// synchronizes the work-group, then writes the tile out transposed. Both
// the global read and the global write walk contiguous addresses; the
// transposition itself happens through the local tile.
compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
{
    // source for the coalesced_transpose kernel
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalesced_transpose(__global const float *src, __global float *dst)
        {
            // one tile staged in on-chip local memory per work-group
            __local float tile[TILE_DIM][TILE_DIM];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            // (each thread loads TILE_DIM / BLOCK_ROWS elements of the tile)
            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            // make the whole tile visible to every thread of the group
            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes: swap the roles of the two group ids so the
            // write targets the mirrored tile of the output matrix
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory, reading the tile transposed
            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // setup compilation flags for the coalesced_transpose program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;

    // create and build the coalesced_transpose program
    compute::program program =
        compute::program::build_with_source(source, context, options.str());

    // create and return coalesced_transpose kernel
    return program.create_kernel("coalesced_transpose");
}
134
// generate a coalesced without bank conflicts kernel
//
// Identical to the coalesced_transpose kernel except that the local tile
// has one padding column (TILE_DIM+1), which offsets each row so column
// accesses during the transposed read no longer hit the same local-memory
// bank.
compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
        {
            // TILE_DIM+1 is here to avoid bank conflicts in local memory
            __local float tile[TILE_DIM][TILE_DIM+1];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            // (each thread loads TILE_DIM / BLOCK_ROWS elements of the tile)
            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            // make the whole tile visible to every thread of the group
            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes: swap the roles of the two group ids so the
            // write targets the mirrored tile of the output matrix
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory, reading the tile transposed
            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // setup compilation flags for the coalesced_no_bank_conflicts program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;

    // create and build the coalesced_no_bank_conflicts program
    compute::program program =
        compute::program::build_with_source(source, context, options.str());

    // create and return the coalesced_no_bank_conflicts kernel
    return program.create_kernel("coalesced_no_bank_conflicts");
}
179
180 // compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
181 bool check_transposition(const std::vector<float>& expectedResult,
182 uint_ size,
183 const std::vector<float>& transposedMatrix)
184 {
185 for(uint_ i = 0 ; i < size ; ++i){
186 if(expectedResult[i] != transposedMatrix[i]){
187 std::cout << "idx = " << i << " , expected " << expectedResult[i]
188 << " , got " << transposedMatrix[i] << std::endl;
189 std::cout << "FAILED" << std::endl;
190 return false;
191 }
192 }
193 return true;
194 }
195
196 // generate a matrix inside 'in' and do the tranposition inside 'out'
197 void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
198 {
199 // generate a matrix
200 for(uint_ i = 0 ; i < rows ; ++i){
201 for(uint_ j = 0 ; j < cols ; ++j){
202 in[i*cols + j] = i*cols + j;
203 }
204 }
205
206 // store transposed result
207 for(uint_ j = 0; j < cols ; ++j){
208 for(uint_ i = 0 ; i < rows ; ++i){
209 out[j*rows + i] = in[i*cols + j];
210 }
211 }
212 }
213
// necessary for 64-bit integer on win32
// NOTE(review): #define-ing the standard name uint64_t is fragile -- with
// C++11's <cstdint> this macro shadows the library typedef; consider
// '#include <cstdint>' instead. Kept as-is for compatibility.
#ifdef _WIN32
#define uint64_t unsigned __int64
#endif
218
// Benchmark driver: builds the four kernels, times each with an OpenCL
// profiling event, reports elapsed time and effective bandwidth, and
// verifies each result against a host-computed reference.
int main(int argc, char *argv[])
{
    // setup command line arguments
    po::options_description options("options");
    options.add_options()
        ("help", "show usage instructions")
        ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
        ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
    ;

    // parse command line
    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, options), vm);
    po::notify(vm);

    // check command line arguments
    if(vm.count("help")){
        std::cout << options << std::endl;
        return 0;
    }

    // get number rows and columns for the matrix
    // NOTE(review): the work sizes below assume both dimensions are
    // multiples of TILE_DIM (32); other values mis-size the grid -- confirm
    const uint_ rows = vm["rows"].as<uint_>();
    const uint_ cols = vm["cols"].as<uint_>();

    // get the default device
    compute::device device = compute::system::default_device();

    // print out device name and matrix information
    std::cout << "Device: " << device.name() << std::endl;
    std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
    std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
    std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
    std::cout << std::endl;

    // On OSX this example does not work on CPU devices
#if defined(__APPLE__)
    if(device.type() & compute::device::cpu) {
        std::cout << "On OSX this example does not work on CPU devices" << std::endl;
        return 0;
    }
#endif

    // each work-item covers TILE_DIM / BLOCK_ROWS rows of its tile, hence
    // the scaled second dimension of the global size
    const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
    const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};

    // setup input data on the host
    const uint_ size = rows * cols;
    std::vector<float> h_input(size);
    std::vector<float> h_output(size);
    std::vector<float> expectedResult(size);
    generate_matrix(h_input, expectedResult, rows, cols);

    // create a context for the device
    compute::context context(device);

    // device vectors
    compute::vector<float> d_input(size, context);
    compute::vector<float> d_output(size, context);

    // command_queue with profiling (needed to read event durations below)
    compute::command_queue queue(context, device, compute::command_queue::enable_profiling);

    // copy input data
    compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);

    // simple copy kernel
    std::cout << "Testing copy_kernel:" << std::endl;
    compute::kernel kernel = make_copy_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    compute::event start;
    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    // elapsed device time in nanoseconds from the profiling event
    // NOTE(review): a zero 'elapsed' would make the bandwidth division
    // below undefined -- unlikely at these sizes, but unchecked
    uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();

    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    // one read + one write of the whole matrix; bytes/ns ~ GB/s
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    // the copy kernel must reproduce the input verbatim
    check_transposition(h_input, rows*cols, h_output);
    std::cout << std::endl;

    // naive_transpose kernel
    std::cout << "Testing naive_transpose:" << std::endl;
    kernel = make_naive_transpose_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    // coalesced_transpose kernel
    std::cout << "Testing coalesced_transpose:" << std::endl;
    kernel = make_coalesced_transpose_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;

    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    // coalesced_no_bank_conflicts kernel
    std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;

    kernel = make_coalesced_no_bank_conflicts_kernel(context);
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
    queue.finish();
    elapsed = start.duration<boost::chrono::nanoseconds>().count();
    std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
    std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;

    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    check_transposition(expectedResult, rows*cols, h_output);
    std::cout << std::endl;

    return 0;
}