[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / reduce_on_gpu.hpp

//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <iterator>

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

/// \internal
/// body reduction inside a warp
template<typename T,bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
             "   uint mask = (i << 1) - 1;\n"      <<
             "   if((lid & mask) == 0){\n"         <<
             "       scratch[lid] += scratch[lid+i];\n" <<
             "   }\n" <<
            "}\n";
        return k.str();
    }
};

/// \internal
/// body reduction inside a warp
/// for nvidia device we can use the "unsafe"
/// memory optimisation
template<typename T>
struct ReduceBody<T,true>
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        // we use TPB to compile only useful instruction
        // local reduction when size is greater than warp size
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "if(TPB >= 1024){\n" <<
            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 512){\n" <<
            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 256){\n" <<
            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 128){\n" <<
            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<

        // warp reduction
        "if(lid < 32){\n" <<
            // volatile this way we don't need any barrier
            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
        "}\n";
        return k.str();
    }
};

template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}

template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}

template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
Commit	Line	Data
7c673cae FG	1	//---------------------------------------------------------------------------//
	2	// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
	3	//
	4	// Distributed under the Boost Software License, Version 1.0
	5	// See accompanying file LICENSE_1_0.txt or copy at
	6	// http://www.boost.org/LICENSE_1_0.txt
	7	//
	8	// See http://boostorg.github.com/compute for more information.
	9	//---------------------------------------------------------------------------//
	10
	11	#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
	12	#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
	13
	14	#include <iterator>
	15
	16	#include <boost/compute/utility/source.hpp>
	17	#include <boost/compute/program.hpp>
	18	#include <boost/compute/command_queue.hpp>
	19	#include <boost/compute/detail/vendor.hpp>
	20	#include <boost/compute/detail/parameter_cache.hpp>
	21	#include <boost/compute/detail/work_size.hpp>
	22	#include <boost/compute/detail/meta_kernel.hpp>
	23	#include <boost/compute/type_traits/type_name.hpp>
	24	#include <boost/compute/utility/program_cache.hpp>
	25
	26	namespace boost {
	27	namespace compute {
	28	namespace detail {
	29
	30	/// \internal
	31	/// body reduction inside a warp
	32	template<typename T,bool isNvidiaDevice>
	33	struct ReduceBody
	34	{
	35	static std::string body()
	36	{
	37	std::stringstream k;
	38	// local reduction
	39	k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
	40	" barrier(CLK_LOCAL_MEM_FENCE);\n" <<
	41	" uint mask = (i << 1) - 1;\n" <<
	42	" if((lid & mask) == 0){\n" <<
	43	" scratch[lid] += scratch[lid+i];\n" <<
	44	" }\n" <<
	45	"}\n";
	46	return k.str();
	47	}
	48	};
	49
	50	/// \internal
	51	/// body reduction inside a warp
	52	/// for nvidia device we can use the "unsafe"
	53	/// memory optimisation
	54	template<typename T>
	55	struct ReduceBody<T,true>
	56	{
	57	static std::string body()
	58	{
	59	std::stringstream k;
	60	// local reduction
	61	// we use TPB to compile only useful instruction
	62	// local reduction when size is greater than warp size
	63	k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
	64	"if(TPB >= 1024){\n" <<
65	"if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
66	"if(TPB >= 512){\n" <<
67	"if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
68	"if(TPB >= 256){\n" <<
69	"if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
70	"if(TPB >= 128){\n" <<
71	"if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
72
73	// warp reduction
74	"if(lid < 32){\n" <<
75	// volatile this way we don't need any barrier
76	"volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
77	"if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
78	"if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
79	"if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
80	"if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
81	"if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
82	"if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
83	"}\n";
84	return k.str();
85	}
86	};
87
88	template<class InputIterator, class Function>
89	inline void initial_reduce(InputIterator first,
90	InputIterator last,
91	buffer result,
92	const Function &function,
93	kernel &reduce_kernel,
94	const uint_ vpt,
95	const uint_ tpb,
96	command_queue &queue)
97	{
98	(void) function;
99	(void) reduce_kernel;
100
101	typedef typename std::iterator_traits<InputIterator>::value_type Arg;
102	typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
103
104	size_t count = std::distance(first, last);
105	detail::meta_kernel k("initial_reduce");
106	k.add_set_arg<const uint_>("count", uint_(count));
107	size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");
108
109	k <<
110	k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
111	k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
112
113	"__local " << type_name<T>() << " scratch[TPB];\n" <<
114
115	// private reduction
116	k.decl<T>("sum") << " = 0;\n" <<
117	"for(uint i = 0; i < VPT; i++){\n" <<
118	" if(offset + lid + i*TPB < count){\n" <<
119	" sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
120	" }\n" <<
121	"}\n" <<
122
123	"scratch[lid] = sum;\n" <<
124
125	// local reduction
126	ReduceBody<T,false>::body() <<
127
128	// write sum to output
129	"if(lid == 0){\n" <<
130	" output[get_group_id(0)] = scratch[0];\n" <<
131	"}\n";
132
133	const context &context = queue.get_context();
134	std::stringstream options;
135	options << "-DVPT=" << vpt << " -DTPB=" << tpb;
136	kernel generic_reduce_kernel = k.compile(context, options.str());
137	generic_reduce_kernel.set_arg(output_arg, result);
138
139	size_t work_size = calculate_work_size(count, vpt, tpb);
140
141	queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
142	}
143
144	template<class T>
145	inline void initial_reduce(const buffer_iterator<T> &first,
146	const buffer_iterator<T> &last,
147	const buffer &result,
148	const plus<T> &function,
149	kernel &reduce_kernel,
150	const uint_ vpt,
151	const uint_ tpb,
152	command_queue &queue)
153	{
154	(void) function;
155
156	size_t count = std::distance(first, last);
157
158	reduce_kernel.set_arg(0, first.get_buffer());
159	reduce_kernel.set_arg(1, uint_(first.get_index()));
160	reduce_kernel.set_arg(2, uint_(count));
161	reduce_kernel.set_arg(3, result);
162	reduce_kernel.set_arg(4, uint_(0));
163
164	size_t work_size = calculate_work_size(count, vpt, tpb);
165
166	queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
167	}
168
169	template<class InputIterator, class T, class Function>
170	inline void reduce_on_gpu(InputIterator first,
171	InputIterator last,
172	buffer_iterator<T> result,
173	Function function,
174	command_queue &queue)
175	{
176	const device &device = queue.get_device();
177	const context &context = queue.get_context();
178
179	detail::meta_kernel k("reduce");
180	k.add_arg<const T*>(memory_object::global_memory, "input");
181	k.add_arg<const uint_>("offset");
182	k.add_arg<const uint_>("count");
183	k.add_arg<T*>(memory_object::global_memory, "output");
184	k.add_arg<const uint_>("output_offset");
185
186	k <<
187	k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
188	"__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
189	k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
190
191	"__local " << type_name<T>() << " scratch[TPB];\n" <<
192	// private reduction
193	k.decl<T>("sum") << " = 0;\n" <<
194	"for(uint i = 0; i < VPT; i++){\n" <<
195	" if(block_offset + lid + i*TPB < count){\n" <<
196	" sum = sum + block[lid+i*TPB]; \n" <<
197	" }\n" <<
198	"}\n" <<
199
200	"scratch[lid] = sum;\n";
201
202	// discrimination on vendor name
203	if(is_nvidia_device(device))
204	k << ReduceBody<T,true>::body();
205	else
206	k << ReduceBody<T,false>::body();
207
208	k <<
209	// write sum to output
210	"if(lid == 0){\n" <<
211	" output[output_offset + get_group_id(0)] = scratch[0];\n" <<
212	"}\n";
213
214	std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
215
216	// load parameters
217	boost::shared_ptr<parameter_cache> parameters =
218	detail::parameter_cache::get_global_cache(device);
219
220	uint_ vpt = parameters->get(cache_key, "vpt", 8);
221	uint_ tpb = parameters->get(cache_key, "tpb", 128);
222
223	// reduce program compiler flags
224	std::stringstream options;
225	options << "-DT=" << type_name<T>()
226	<< " -DVPT=" << vpt
227	<< " -DTPB=" << tpb;
228
229	// load program
230	boost::shared_ptr<program_cache> cache =
231	program_cache::get_global_cache(context);
232
233	program reduce_program = cache->get_or_build(
234	cache_key, options.str(), k.source(), context
235	);
236
237	// create reduce kernel
238	kernel reduce_kernel(reduce_program, "reduce");
239
240	size_t count = std::distance(first, last);
241
242	// first pass, reduce from input to ping
243	buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
244	initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);
245
246	// update count after initial reduce
247	count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
248
249	// middle pass(es), reduce between ping and pong
250	const buffer *input_buffer = &ping;
251	buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
252	const buffer *output_buffer = &pong;
253	if(count > vpt * tpb){
254	while(count > vpt * tpb){
255	reduce_kernel.set_arg(0, *input_buffer);
256	reduce_kernel.set_arg(1, uint_(0));
257	reduce_kernel.set_arg(2, uint_(count));
258	reduce_kernel.set_arg(3, *output_buffer);
259	reduce_kernel.set_arg(4, uint_(0));
260
261	size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
262	if(work_size % tpb != 0){
263	work_size += tpb - work_size % tpb;
264	}
265	queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
266
267	std::swap(input_buffer, output_buffer);
268	count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
269	}
270	}
271
272	// final pass, reduce from ping/pong to result
273	reduce_kernel.set_arg(0, *input_buffer);
274	reduce_kernel.set_arg(1, uint_(0));
275	reduce_kernel.set_arg(2, uint_(count));
276	reduce_kernel.set_arg(3, result.get_buffer());
277	reduce_kernel.set_arg(4, uint_(result.get_index()));
278
279	queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
280	}
281
282	} // end detail namespace
283	} // end compute namespace
284	} // end boost namespace
285
286	#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP