[ceph.git] / ceph / src / boost / boost / compute / algorithm / reduce.hpp

//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP

#include <iterator>

#include <boost/static_assert.hpp>

#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/array.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/result_of.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class InputIterator, class OutputIterator, class BinaryFunction>
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}

template<class InputIterator, class BinaryFunction>
inline vector<
    typename boost::compute::result_of<
        BinaryFunction(
            typename std::iterator_traits<InputIterator>::value_type,
            typename std::iterator_traits<InputIterator>::value_type
        )
    >::type
>
block_reduce(InputIterator first,
             size_t count,
             size_t block_size,
             BinaryFunction function,
             command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
    vector<result_type> result_vector(total_block_count, context);

    reduce(first, count, result_vector.begin(), block_size, function, queue);

    return result_vector;
}

// Space complexity: O( ceil(n / 2 / 256) )
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void generic_reduce(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();

    size_t count = detail::iterator_range_size(first, last);

    if(device.type() & device::cpu){
        array<result_type, 1> value(context);
        detail::reduce_on_cpu(first, last, value.begin(), function, queue);
        boost::compute::copy_n(value.begin(), 1, result, queue);
    }
    else {
        size_t block_size = 256;

        // first pass
        vector<result_type> results = detail::block_reduce(first,
                                                           count,
                                                           block_size,
                                                           function,
                                                           queue);

        if(results.size() > 1){
            detail::inplace_reduce(results.begin(),
                                   results.end(),
                                   function,
                                   queue);
        }

        boost::compute::copy_n(results.begin(), 1, result, queue);
    }
}

template<class InputIterator, class OutputIterator, class T>
inline void dispatch_reduce(InputIterator first,
                            InputIterator last,
                            OutputIterator result,
                            const plus<T> &function,
                            command_queue &queue)
{
    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // reduce to temporary buffer on device
    array<T, 1> value(context);
    if(device.type() & device::cpu){
        detail::reduce_on_cpu(first, last, value.begin(), function, queue);
    }
    else {
        reduce_on_gpu(first, last, value.begin(), function, queue);
    }

    // copy to result iterator
    copy_n(value.begin(), 1, result, queue);
}

template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void dispatch_reduce(InputIterator first,
                            InputIterator last,
                            OutputIterator result,
                            BinaryFunction function,
                            command_queue &queue)
{
    generic_reduce(first, last, result, function, queue);
}

} // end detail namespace

/// Returns the result of applying \p function to the elements in the
/// range [\p first, \p last).
///
/// If no function is specified, \c plus will be used.
///
/// \param first first element in the input range
/// \param last last element in the input range
/// \param result iterator pointing to the output
/// \param function binary reduction function
/// \param queue command queue to perform the operation
///
/// The \c reduce() algorithm assumes that the binary reduction function is
/// associative. When used with non-associative functions the result may
/// be non-deterministic and vary in precision. Notably this affects the
/// \c plus<float>() function as floating-point addition is not associative
/// and may produce slightly different results than a serial algorithm.
///
/// This algorithm supports both host and device iterators for the
/// result argument. This allows for values to be reduced and copied
/// to the host all with a single function call.
///
/// For example, to calculate the sum of the values in a device vector and
/// copy the result to a value on the host:
///
/// \snippet test/test_reduce.cpp sum_int
///
/// Note that while the the \c reduce() algorithm is conceptually identical to
/// the \c accumulate() algorithm, its implementation is substantially more
/// efficient on parallel hardware. For more information, see the documentation
/// on the \c accumulate() algorithm.
///
/// Space complexity on GPUs: \Omega(n)<br>
/// Space complexity on CPUs: \Omega(1)
///
/// \see accumulate()
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   BinaryFunction function,
                   command_queue &queue = system::default_queue())
{
    BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
    if(first == last){
        return;
    }

    detail::dispatch_reduce(first, last, result, function, queue);
}

/// \overload
template<class InputIterator, class OutputIterator>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   command_queue &queue = system::default_queue())
{
    BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
    typedef typename std::iterator_traits<InputIterator>::value_type T;

    if(first == last){
        return;
    }

    detail::dispatch_reduce(first, last, result, plus<T>(), queue);
}

} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
Commit	Line	Data
7c673cae FG	1	//---------------------------------------------------------------------------//
	2	// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
	3	//
	4	// Distributed under the Boost Software License, Version 1.0
	5	// See accompanying file LICENSE_1_0.txt or copy at
	6	// http://www.boost.org/LICENSE_1_0.txt
	7	//
	8	// See http://boostorg.github.com/compute for more information.
	9	//---------------------------------------------------------------------------//
	10
	11	#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
	12	#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
	13
	14	#include <iterator>
	15
92f5a8d4 TL	16	#include <boost/static_assert.hpp>
92f5a8d4 TL	17
7c673cae FG	18	#include <boost/compute/system.hpp>
	19	#include <boost/compute/functional.hpp>
	20	#include <boost/compute/detail/meta_kernel.hpp>
	21	#include <boost/compute/command_queue.hpp>
	22	#include <boost/compute/container/array.hpp>
	23	#include <boost/compute/container/vector.hpp>
	24	#include <boost/compute/algorithm/copy_n.hpp>
	25	#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
	26	#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
	27	#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
	28	#include <boost/compute/detail/iterator_range_size.hpp>
	29	#include <boost/compute/memory/local_buffer.hpp>
	30	#include <boost/compute/type_traits/result_of.hpp>
92f5a8d4	31	#include <boost/compute/type_traits/is_device_iterator.hpp>
7c673cae FG	32
	33	namespace boost {
	34	namespace compute {
	35	namespace detail {
	36
	37	template<class InputIterator, class OutputIterator, class BinaryFunction>
	38	size_t reduce(InputIterator first,
	39	size_t count,
	40	OutputIterator result,
	41	size_t block_size,
	42	BinaryFunction function,
	43	command_queue &queue)
	44	{
	45	typedef typename
	46	std::iterator_traits<InputIterator>::value_type
	47	input_type;
	48	typedef typename
	49	boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
	50	result_type;
	51
	52	const context &context = queue.get_context();
	53	size_t block_count = count / 2 / block_size;
	54	size_t total_block_count =
	55	static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
	56
	57	if(block_count != 0){
	58	meta_kernel k("block_reduce");
	59	size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
	60	size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
	61
	62	k <<
	63	"const uint gid = get_global_id(0);\n" <<
	64	"const uint lid = get_local_id(0);\n" <<
	65
	66	// copy values to local memory
	67	"block[lid] = " <<
	68	function(first[k.make_var<uint_>("gid*2+0")],
	69	first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
	70
	71	// perform reduction
	72	"for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
	73	" barrier(CLK_LOCAL_MEM_FENCE);\n" <<
	74	" uint mask = (i << 1) - 1;\n" <<
	75	" if((lid & mask) == 0){\n" <<
	76	" block[lid] = " <<
	77	function(k.expr<input_type>("block[lid]"),
	78	k.expr<input_type>("block[lid+i]")) << ";\n" <<
	79	" }\n" <<
	80	"}\n" <<
	81
	82	// write block result to global output
	83	"if(lid == 0)\n" <<
	84	" output[get_group_id(0)] = block[0];\n";
	85
	86	kernel kernel = k.compile(context);
	87	kernel.set_arg(output_arg, result.get_buffer());
	88	kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
	89
	90	queue.enqueue_1d_range_kernel(kernel,
	91	0,
	92	block_count * block_size,
	93	block_size);
	94	}
	95
96	// serially reduce any leftovers
97	if(block_count * block_size * 2 < count){
98	size_t last_block_start = block_count * block_size * 2;
99
100	meta_kernel k("extra_serial_reduce");
101	size_t count_arg = k.add_arg<uint_>("count");
102	size_t offset_arg = k.add_arg<uint_>("offset");
103	size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
104	size_t output_offset_arg = k.add_arg<uint_>("output_offset");
105
106	k <<
107	k.decl<result_type>("result") << " = \n" <<
108	first[k.expr<uint_>("offset")] << ";\n" <<
109	"for(uint i = offset + 1; i < count; i++)\n" <<
110	" result = " <<
111	function(k.var<result_type>("result"),
112	first[k.var<uint_>("i")]) << ";\n" <<
113	"output[output_offset] = result;\n";
114
115	kernel kernel = k.compile(context);
116	kernel.set_arg(count_arg, static_cast<uint_>(count));
117	kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
118	kernel.set_arg(output_arg, result.get_buffer());
119	kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
120
121	queue.enqueue_task(kernel);
122	}
123
124	return total_block_count;
125	}
126
127	template<class InputIterator, class BinaryFunction>
128	inline vector<
129	typename boost::compute::result_of<
130	BinaryFunction(
131	typename std::iterator_traits<InputIterator>::value_type,
132	typename std::iterator_traits<InputIterator>::value_type
133	)
134	>::type
135	>
136	block_reduce(InputIterator first,
137	size_t count,
138	size_t block_size,
139	BinaryFunction function,
140	command_queue &queue)
141	{
142	typedef typename
143	std::iterator_traits<InputIterator>::value_type
144	input_type;
145	typedef typename
146	boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
147	result_type;
148
149	const context &context = queue.get_context();
150	size_t total_block_count =
151	static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
152	vector<result_type> result_vector(total_block_count, context);
153
154	reduce(first, count, result_vector.begin(), block_size, function, queue);
155
156	return result_vector;
157	}
158
b32b8144	159	// Space complexity: O( ceil(n / 2 / 256) )
7c673cae FG	160	template<class InputIterator, class OutputIterator, class BinaryFunction>
	161	inline void generic_reduce(InputIterator first,
	162	InputIterator last,
	163	OutputIterator result,
	164	BinaryFunction function,
	165	command_queue &queue)
	166	{
	167	typedef typename
	168	std::iterator_traits<InputIterator>::value_type
	169	input_type;
	170	typedef typename
	171	boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
	172	result_type;
	173
	174	const device &device = queue.get_device();
	175	const context &context = queue.get_context();
	176
	177	size_t count = detail::iterator_range_size(first, last);
	178
	179	if(device.type() & device::cpu){
	180	array<result_type, 1> value(context);
	181	detail::reduce_on_cpu(first, last, value.begin(), function, queue);
	182	boost::compute::copy_n(value.begin(), 1, result, queue);
	183	}
	184	else {
	185	size_t block_size = 256;
	186
	187	// first pass
	188	vector<result_type> results = detail::block_reduce(first,
	189	count,
	190	block_size,
	191	function,
	192	queue);
	193
	194	if(results.size() > 1){
	195	detail::inplace_reduce(results.begin(),
	196	results.end(),
	197	function,
	198	queue);
	199	}
	200
	201	boost::compute::copy_n(results.begin(), 1, result, queue);
	202	}
	203	}
	204
	205	template<class InputIterator, class OutputIterator, class T>
	206	inline void dispatch_reduce(InputIterator first,
	207	InputIterator last,
	208	OutputIterator result,
	209	const plus<T> &function,
	210	command_queue &queue)
	211	{
	212	const context &context = queue.get_context();
	213	const device &device = queue.get_device();
	214
	215	// reduce to temporary buffer on device
	216	array<T, 1> value(context);
	217	if(device.type() & device::cpu){
	218	detail::reduce_on_cpu(first, last, value.begin(), function, queue);
	219	}
	220	else {
	221	reduce_on_gpu(first, last, value.begin(), function, queue);
	222	}
	223
224	// copy to result iterator
225	copy_n(value.begin(), 1, result, queue);
226	}
227
228	template<class InputIterator, class OutputIterator, class BinaryFunction>
229	inline void dispatch_reduce(InputIterator first,
230	InputIterator last,
231	OutputIterator result,
232	BinaryFunction function,
233	command_queue &queue)
234	{
235	generic_reduce(first, last, result, function, queue);
236	}
237
238	} // end detail namespace
239
240	/// Returns the result of applying \p function to the elements in the
241	/// range [\p first, \p last).
242	///
243	/// If no function is specified, \c plus will be used.
244	///
245	/// \param first first element in the input range
246	/// \param last last element in the input range
247	/// \param result iterator pointing to the output
248	/// \param function binary reduction function
249	/// \param queue command queue to perform the operation
250	///
251	/// The \c reduce() algorithm assumes that the binary reduction function is
252	/// associative. When used with non-associative functions the result may
253	/// be non-deterministic and vary in precision. Notably this affects the
254	/// \c plus<float>() function as floating-point addition is not associative
255	/// and may produce slightly different results than a serial algorithm.
256	///
257	/// This algorithm supports both host and device iterators for the
258	/// result argument. This allows for values to be reduced and copied
259	/// to the host all with a single function call.
260	///
261	/// For example, to calculate the sum of the values in a device vector and
262	/// copy the result to a value on the host:
263	///
264	/// \snippet test/test_reduce.cpp sum_int
265	///
266	/// Note that while the the \c reduce() algorithm is conceptually identical to
267	/// the \c accumulate() algorithm, its implementation is substantially more
268	/// efficient on parallel hardware. For more information, see the documentation
269	/// on the \c accumulate() algorithm.
270	///
b32b8144 FG	271	/// Space complexity on GPUs: \Omega(n)<br>
	272	/// Space complexity on CPUs: \Omega(1)
	273	///
7c673cae FG	274	/// \see accumulate()
	275	template<class InputIterator, class OutputIterator, class BinaryFunction>
	276	inline void reduce(InputIterator first,
	277	InputIterator last,
	278	OutputIterator result,
	279	BinaryFunction function,
	280	command_queue &queue = system::default_queue())
	281	{
92f5a8d4	282	BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
7c673cae FG	283	if(first == last){
	284	return;
	285	}
	286
	287	detail::dispatch_reduce(first, last, result, function, queue);
	288	}
	289
	290	/// \overload
	291	template<class InputIterator, class OutputIterator>
	292	inline void reduce(InputIterator first,
	293	InputIterator last,
	294	OutputIterator result,
	295	command_queue &queue = system::default_queue())
	296	{
92f5a8d4	297	BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
7c673cae FG	298	typedef typename std::iterator_traits<InputIterator>::value_type T;
	299
	300	if(first == last){
	301	return;
	302	}
	303
	304	detail::dispatch_reduce(first, last, result, plus<T>(), queue);
	305	}
	306
	307	} // end compute namespace
	308	} // end boost namespace
	309
	310	#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP