//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP

#include <cmath>
#include <iterator>

#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {
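// Finds the position of the first element in [first, last) for which
// predicate returns true, checking one value per work-item and resolving
// concurrent matches with an atomic min on a shared index.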
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
                                                  InputIterator last,
                                                  UnaryPredicate predicate,
                                                  const size_t count,
                                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    atomic_min<uint_> atomic_min_uint;

    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << " = "
      << first[k.var<const uint_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    // initialize index to count (the index of last) so that
    // "no match" reads back as the last iterator
    index.write(static_cast<uint_>(count), queue);
    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read the winning index and return the corresponding iterator
    return first + static_cast<difference_type>(index.read(queue));
}

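// Same idea, but each work-item examines vpt ("values per thread")
// elements and exits early once a smaller index has already been found.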
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;
    // on GPUs reads from global memory are coalesced: adjacent work-items
    // read adjacent elements, then stride forward by the local size
    if(device.type() & device::gpu) {
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                "count" <<
            ");\n" <<

            // check if an index has already been found
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n" <<
            "}\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                        first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id += lsize;\n" <<
            "}\n";
    // on CPUs (and other devices) each work-item reads its values in
    // order, which makes efficient use of the large cache
    } else {
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + " << k.var<uint_>("vpt") << "," <<
                "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                        first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to count (the index of last) so that
    // "no match" reads back as the last iterator
    index.write(static_cast<uint_>(count), queue);

    // launch one work-item per vpt values, rounded up
    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read the winning index and return the corresponding iterator
    return first + static_cast<difference_type>(index.read(queue));
}

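// Space-efficient implementation of find_if() that resolves the first
// matching position with atomic_min() on a single index value instead of
// a reduction. Chooses between the one-value-per-thread and
// multiple-values-per-thread kernels based on the device type, the input
// size and cached tuning parameters.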
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
                                          InputIterator last,
                                          UnaryPredicate predicate,
                                          command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const device &device = queue.get_device();

    // load cached tuning parameters
    std::string cache_key = std::string("__boost_find_if_with_atomics_")
        + type_name<value_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // for relatively small inputs on GPUs, a kernel that checks one value
    // per thread (work-item) is more efficient than the
    // multiple-values-per-thread version
    if(device.type() & device::gpu){
        const size_t one_vpt_threshold =
            parameters->get(cache_key, "one_vpt_threshold", 1048576);
        if(count <= one_vpt_threshold){
            return find_if_with_atomics_one_vpt(
                first, last, predicate, count, queue
            );
        }
    }

    // values per thread (work-item)
    size_t vpt;
    if(device.type() & device::gpu){
        // get the cached vpt parameter
        vpt = parameters->get(cache_key, "vpt", 32);
    } else {
        // on CPUs the work is split evenly between compute units
        const size_t max_compute_units =
            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
        vpt = static_cast<size_t>(
            std::ceil(float(count) / max_compute_units)
        );
    }

    return find_if_with_atomics_multiple_vpt(
        first, last, predicate, count, vpt, queue
    );
}
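
// Usage sketch: the public boost::compute::find_if() dispatches here for
// device iterators, e.g. (assumes <boost/compute/lambda.hpp> is included
// for the _1 placeholder):
//
//   boost::compute::vector<int> vec = /* ... */;
//   using boost::compute::lambda::_1;
//   boost::compute::vector<int>::iterator it =
//       boost::compute::find_if(vec.begin(), vec.end(), _1 > 4, queue);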

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP