[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / detail / scan_on_cpu.hpp

//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP

#include <iterator>

#include <boost/compute/device.hpp>
#include <boost/compute/kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/serial_scan.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold) {
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    buffer block_partial_sums(context, sizeof(output_type) * compute_units );

    // create scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // Arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n";

    if(!exclusive){
        k <<
            k.decl<output_type>("sum") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "index++;\n";
    }
    else {
        k <<
            k.decl<output_type>("sum") << ";\n" <<
            "if(index == 0){\n" <<
                "sum = initial_value;\n" <<
            "}\n" <<
            "else {\n" <<
                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
                "index++;\n" <<
            "}\n";
    }

    k <<
        "while(index < end){\n" <<
            // load next value
            k.decl<const input_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n";

    if(exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }
    k <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";

    if(!exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }

    k <<
            "index++;\n" <<
        "}\n" << // end while
        "block_partial_sums[get_global_id(0)] = sum;\n";

    // compile scan kernel
    kernel block_scan_kernel = k.compile(context);

    // setup kernel arguments
    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    size_t global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);

    // scan is done
    if(compute_units < 2) {
        return result + count;
    }

    // final scan kernel
    meta_kernel l("scan_on_cpu_final_scan");

    // Arguments
    count_arg = l.add_arg<uint_>("count");
    block_partial_sums_arg =
        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    l <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = block + get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<

        k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
        "}\n" <<

        "while(index < end){\n";
    if(exclusive){
        l <<
            l.decl<output_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";
    }
    else {
        l <<
            "sum = " << op(k.var<output_type>("sum"),
                           first[k.var<uint_>("index")]) << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n";
    }
    l <<
            "index++;\n" <<
        "}\n";


    // compile scan kernel
    kernel final_scan_kernel = l.compile(context);

    // setup kernel arguments
    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);

    // return iterator pointing to the end of the result range
    return result + count;
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
Commit	Line	Data
7c673cae FG	1	//---------------------------------------------------------------------------//
	2	// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
	3	//
	4	// Distributed under the Boost Software License, Version 1.0
	5	// See accompanying file LICENSE_1_0.txt or copy at
	6	// http://www.boost.org/LICENSE_1_0.txt
	7	//
	8	// See http://boostorg.github.com/compute for more information.
	9	//---------------------------------------------------------------------------//
	10
	11	#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
	12	#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
	13
	14	#include <iterator>
	15
	16	#include <boost/compute/device.hpp>
	17	#include <boost/compute/kernel.hpp>
	18	#include <boost/compute/command_queue.hpp>
	19	#include <boost/compute/algorithm/detail/serial_scan.hpp>
	20	#include <boost/compute/detail/meta_kernel.hpp>
	21	#include <boost/compute/detail/iterator_range_size.hpp>
	22	#include <boost/compute/detail/parameter_cache.hpp>
	23
	24	namespace boost {
	25	namespace compute {
	26	namespace detail {
	27
	28	template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
	29	inline OutputIterator scan_on_cpu(InputIterator first,
	30	InputIterator last,
	31	OutputIterator result,
	32	bool exclusive,
	33	T init,
	34	BinaryOperator op,
	35	command_queue &queue)
	36	{
	37	typedef typename
	38	std::iterator_traits<InputIterator>::value_type input_type;
	39	typedef typename
	40	std::iterator_traits<OutputIterator>::value_type output_type;
	41
	42	const context &context = queue.get_context();
	43	const device &device = queue.get_device();
	44	const size_t compute_units = queue.get_device().compute_units();
	45
	46	boost::shared_ptr<parameter_cache> parameters =
	47	detail::parameter_cache::get_global_cache(device);
	48
	49	std::string cache_key =
	50	"__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
	51
	52	// for inputs smaller than serial_scan_threshold
	53	// serial_scan algorithm is used
	54	uint_ serial_scan_threshold =
	55	parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
	56	serial_scan_threshold =
	57	(std::max)(serial_scan_threshold, uint_(compute_units));
	58
	59	size_t count = detail::iterator_range_size(first, last);
	60	if(count == 0){
	61	return result;
	62	}
	63	else if(count < serial_scan_threshold) {
	64	return serial_scan(first, last, result, exclusive, init, op, queue);
65	}
66
67	buffer block_partial_sums(context, sizeof(output_type) * compute_units );
68
69	// create scan kernel
70	meta_kernel k("scan_on_cpu_block_scan");
71
72	// Arguments
73	size_t count_arg = k.add_arg<uint_>("count");
74	size_t init_arg = k.add_arg<output_type>("initial_value");
75	size_t block_partial_sums_arg =
76	k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
77
78	k <<
79	"uint block = " <<
80	"(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
81	"uint index = get_global_id(0) * block;\n" <<
82	"uint end = min(count, index + block);\n";
83
84	if(!exclusive){
85	k <<
86	k.decl<output_type>("sum") << " = " <<
87	first[k.var<uint_>("index")] << ";\n" <<
88	result[k.var<uint_>("index")] << " = sum;\n" <<
89	"index++;\n";
90	}
91	else {
92	k <<
93	k.decl<output_type>("sum") << ";\n" <<
94	"if(index == 0){\n" <<
95	"sum = initial_value;\n" <<
96	"}\n" <<
97	"else {\n" <<
98	"sum = " << first[k.var<uint_>("index")] << ";\n" <<
99	"index++;\n" <<
100	"}\n";
101	}
102
103	k <<
104	"while(index < end){\n" <<
105	// load next value
106	k.decl<const input_type>("value") << " = "
107	<< first[k.var<uint_>("index")] << ";\n";
108
109	if(exclusive){
110	k <<
111	"if(get_global_id(0) == 0){\n" <<
112	result[k.var<uint_>("index")] << " = sum;\n" <<
113	"}\n";
114	}
115	k <<
116	"sum = " << op(k.var<output_type>("sum"),
117	k.var<output_type>("value")) << ";\n";
118
119	if(!exclusive){
120	k <<
121	"if(get_global_id(0) == 0){\n" <<
122	result[k.var<uint_>("index")] << " = sum;\n" <<
123	"}\n";
124	}
125
126	k <<
127	"index++;\n" <<
128	"}\n" << // end while
129	"block_partial_sums[get_global_id(0)] = sum;\n";
130
131	// compile scan kernel
132	kernel block_scan_kernel = k.compile(context);
133
134	// setup kernel arguments
135	block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
136	block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
137	block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
138
139	// execute the kernel
140	size_t global_work_size = compute_units;
141	queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
142
143	// scan is done
144	if(compute_units < 2) {
145	return result + count;
146	}
147
148	// final scan kernel
149	meta_kernel l("scan_on_cpu_final_scan");
150
151	// Arguments
152	count_arg = l.add_arg<uint_>("count");
153	block_partial_sums_arg =
154	l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
155
156	l <<
157	"uint block = " <<
158	"(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
159	"uint index = block + get_global_id(0) * block;\n" <<
160	"uint end = min(count, index + block);\n" <<
161
162	k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
163	"for(uint i = 0; i < get_global_id(0); i++) {\n" <<
164	"sum = " << op(k.var<output_type>("sum"),
165	k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
166	"}\n" <<
167
168	"while(index < end){\n";
169	if(exclusive){
170	l <<
171	l.decl<output_type>("value") << " = "
172	<< first[k.var<uint_>("index")] << ";\n" <<
173	result[k.var<uint_>("index")] << " = sum;\n" <<
174	"sum = " << op(k.var<output_type>("sum"),
175	k.var<output_type>("value")) << ";\n";
176	}
177	else {
178	l <<
179	"sum = " << op(k.var<output_type>("sum"),
180	first[k.var<uint_>("index")]) << ";\n" <<
181	result[k.var<uint_>("index")] << " = sum;\n";
182	}
183	l <<
184	"index++;\n" <<
185	"}\n";
186
187
188	// compile scan kernel
189	kernel final_scan_kernel = l.compile(context);
190
191	// setup kernel arguments
192	final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
193	final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
194
195	// execute the kernel
196	global_work_size = compute_units;
197	queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
198
199	// return iterator pointing to the end of the result range
200	return result + count;
201	}
202
203	} // end detail namespace
204	} // end compute namespace
205	} // end boost namespace
206
207	#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP