]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / compute / kernels / vector_hash_benchmark.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
#include "benchmark/benchmark.h"

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/memory_pool.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"

#include "arrow/compute/api.h"
29
30 namespace arrow {
31 namespace compute {
32
33 static void BuildDictionary(benchmark::State& state) { // NOLINT non-const reference
34 const int64_t iterations = 1024;
35
36 std::vector<int64_t> values;
37 std::vector<bool> is_valid;
38 for (int64_t i = 0; i < iterations; i++) {
39 for (int64_t j = 0; j < i; j++) {
40 is_valid.push_back((i + j) % 9 != 0);
41 values.push_back(j);
42 }
43 }
44
45 std::shared_ptr<Array> arr;
46 ArrayFromVector<Int64Type, int64_t>(is_valid, values, &arr);
47
48 while (state.KeepRunning()) {
49 ABORT_NOT_OK(DictionaryEncode(arr).status());
50 }
51 state.counters["null_percent"] =
52 static_cast<double>(arr->null_count()) / arr->length() * 100;
53 state.SetBytesProcessed(state.iterations() * values.size() * sizeof(int64_t));
54 state.SetItemsProcessed(state.iterations() * values.size());
55 }
56
57 static void BuildStringDictionary(
58 benchmark::State& state) { // NOLINT non-const reference
59 const int64_t iterations = 1024 * 64;
60 // Pre-render strings
61 std::vector<std::string> data;
62
63 int64_t total_bytes = 0;
64 for (int64_t i = 0; i < iterations; i++) {
65 std::stringstream ss;
66 ss << i;
67 auto val = ss.str();
68 data.push_back(val);
69 total_bytes += static_cast<int64_t>(val.size());
70 }
71
72 std::shared_ptr<Array> arr;
73 ArrayFromVector<StringType, std::string>(data, &arr);
74
75 while (state.KeepRunning()) {
76 ABORT_NOT_OK(DictionaryEncode(arr).status());
77 }
78 state.SetBytesProcessed(state.iterations() * total_bytes);
79 state.SetItemsProcessed(state.iterations() * data.size());
80 }
81
// One benchmark configuration. Kept as a plain aggregate because the case
// tables in this file brace-initialize it positionally:
// {length, num_unique, null_probability}.
struct HashBenchCase {
  // Number of elements in the generated input array.
  int64_t length;
  // Upper bound handed to the random draw that selects each element's value.
  int64_t num_unique;
  // Probability passed to the validity generator; reported as a percentage.
  double null_probability;
};
87
88 template <typename Type>
89 struct HashParams {
90 using T = typename Type::c_type;
91
92 HashBenchCase params;
93
94 void GenerateTestData(std::shared_ptr<Array>* arr) const {
95 std::vector<int64_t> draws;
96 std::vector<T> values;
97 std::vector<bool> is_valid;
98 randint<int64_t>(params.length, 0, params.num_unique, &draws);
99 for (int64_t draw : draws) {
100 values.push_back(static_cast<T>(draw));
101 }
102 if (params.null_probability > 0) {
103 random_is_valid(params.length, params.null_probability, &is_valid);
104 ArrayFromVector<Type, T>(is_valid, values, arr);
105 } else {
106 ArrayFromVector<Type, T>(values, arr);
107 }
108 }
109
110 void SetMetadata(benchmark::State& state) const {
111 state.counters["null_percent"] = params.null_probability * 100;
112 state.counters["num_unique"] = static_cast<double>(params.num_unique);
113 state.SetBytesProcessed(state.iterations() * params.length * sizeof(T));
114 state.SetItemsProcessed(state.iterations() * params.length);
115 }
116 };
117
118 template <>
119 struct HashParams<StringType> {
120 HashBenchCase params;
121 int32_t byte_width;
122 void GenerateTestData(std::shared_ptr<Array>* arr) const {
123 std::vector<int64_t> draws;
124 randint<int64_t>(params.length, 0, params.num_unique, &draws);
125
126 const int64_t total_bytes = this->byte_width * params.num_unique;
127 std::vector<uint8_t> uniques(total_bytes);
128 const uint32_t seed = 0;
129 random_bytes(total_bytes, seed, uniques.data());
130
131 std::vector<bool> is_valid;
132 if (params.null_probability > 0) {
133 random_is_valid(params.length, params.null_probability, &is_valid);
134 }
135
136 StringBuilder builder;
137 for (int64_t i = 0; i < params.length; ++i) {
138 if (params.null_probability == 0 || is_valid[i]) {
139 ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i],
140 this->byte_width));
141 } else {
142 ABORT_NOT_OK(builder.AppendNull());
143 }
144 }
145 ABORT_NOT_OK(builder.Finish(arr));
146 }
147
148 void SetMetadata(benchmark::State& state) const {
149 state.counters["null_percent"] = params.null_probability * 100;
150 state.counters["num_unique"] = static_cast<double>(params.num_unique);
151 state.SetBytesProcessed(state.iterations() * params.length * byte_width);
152 state.SetItemsProcessed(state.iterations() * params.length);
153 }
154 };
155
156 template <typename ParamType>
157 void BenchUnique(benchmark::State& state, const ParamType& params) {
158 std::shared_ptr<Array> arr;
159 params.GenerateTestData(&arr);
160
161 while (state.KeepRunning()) {
162 ABORT_NOT_OK(Unique(arr).status());
163 }
164 params.SetMetadata(state);
165 }
166
167 template <typename ParamType>
168 void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) {
169 std::shared_ptr<Array> arr;
170 params.GenerateTestData(&arr);
171 while (state.KeepRunning()) {
172 ABORT_NOT_OK(DictionaryEncode(arr).status());
173 }
174 params.SetMetadata(state);
175 }
176
177 constexpr int kHashBenchmarkLength = 1 << 22;
178
179 // clang-format off
180 std::vector<HashBenchCase> uint8_bench_cases = {
181 {kHashBenchmarkLength, 200, 0},
182 {kHashBenchmarkLength, 200, 0.001},
183 {kHashBenchmarkLength, 200, 0.01},
184 {kHashBenchmarkLength, 200, 0.1},
185 {kHashBenchmarkLength, 200, 0.5},
186 {kHashBenchmarkLength, 200, 0.99},
187 {kHashBenchmarkLength, 200, 1}
188 };
189 // clang-format on
190
191 static void UniqueUInt8(benchmark::State& state) {
192 BenchUnique(state, HashParams<UInt8Type>{uint8_bench_cases[state.range(0)]});
193 }
194
195 // clang-format off
196 std::vector<HashBenchCase> general_bench_cases = {
197 {kHashBenchmarkLength, 100, 0},
198 {kHashBenchmarkLength, 100, 0.001},
199 {kHashBenchmarkLength, 100, 0.01},
200 {kHashBenchmarkLength, 100, 0.1},
201 {kHashBenchmarkLength, 100, 0.5},
202 {kHashBenchmarkLength, 100, 0.99},
203 {kHashBenchmarkLength, 100, 1},
204 {kHashBenchmarkLength, 100000, 0},
205 {kHashBenchmarkLength, 100000, 0.001},
206 {kHashBenchmarkLength, 100000, 0.01},
207 {kHashBenchmarkLength, 100000, 0.1},
208 {kHashBenchmarkLength, 100000, 0.5},
209 {kHashBenchmarkLength, 100000, 0.99},
210 {kHashBenchmarkLength, 100000, 1},
211 };
212 // clang-format on
213
214 static void UniqueInt64(benchmark::State& state) {
215 BenchUnique(state, HashParams<Int64Type>{general_bench_cases[state.range(0)]});
216 }
217
218 static void UniqueString10bytes(benchmark::State& state) {
219 // Byte strings with 10 bytes each
220 BenchUnique(state, HashParams<StringType>{general_bench_cases[state.range(0)], 10});
221 }
222
223 static void UniqueString100bytes(benchmark::State& state) {
224 // Byte strings with 100 bytes each
225 BenchUnique(state, HashParams<StringType>{general_bench_cases[state.range(0)], 100});
226 }
227
228 void HashSetArgs(benchmark::internal::Benchmark* bench) {
229 for (int i = 0; i < static_cast<int>(general_bench_cases.size()); ++i) {
230 bench->Arg(i);
231 }
232 }
233
234 BENCHMARK(BuildDictionary);
235 BENCHMARK(BuildStringDictionary);
236
237 BENCHMARK(UniqueInt64)->Apply(HashSetArgs);
238 BENCHMARK(UniqueString10bytes)->Apply(HashSetArgs);
239 BENCHMARK(UniqueString100bytes)->Apply(HashSetArgs);
240
241 void UInt8SetArgs(benchmark::internal::Benchmark* bench) {
242 for (int i = 0; i < static_cast<int>(uint8_bench_cases.size()); ++i) {
243 bench->Arg(i);
244 }
245 }
246
247 BENCHMARK(UniqueUInt8)->Apply(UInt8SetArgs);
248
249 } // namespace compute
250 } // namespace arrow