]>
git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
18 #include "benchmark/benchmark.h"
22 #include "arrow/array/builder_binary.h"
23 #include "arrow/memory_pool.h"
24 #include "arrow/testing/gtest_util.h"
25 #include "arrow/testing/random.h"
26 #include "arrow/testing/util.h"
28 #include "arrow/compute/api.h"
// Benchmark: times DictionaryEncode on an Int64 array assembled from the
// `is_valid` and `values` vectors built below.
//
// The nested loops visit every pair (i, j) with 0 <= j < i < 1024 and push
// `(i + j) % 9 != 0` onto `is_valid` for each pair.
//
// NOTE(review): no statement that fills `values` is visible in this extract,
// although `values` is passed to ArrayFromVector and `values.size()` drives the
// byte/item counters at the end. The embedded source numbering skips 41-44
// right after the push into `is_valid`, so the fill (and the loops' closing
// braces) are presumably on lines not shown here -- confirm against the file.
33 static void BuildDictionary(benchmark::State
& state
) { // NOLINT non-const reference
34 const int64_t iterations
= 1024;
36 std::vector
<int64_t> values
;
37 std::vector
<bool> is_valid
;
38 for (int64_t i
= 0; i
< iterations
; i
++) {
39 for (int64_t j
= 0; j
< i
; j
++) {
40 is_valid
.push_back((i
+ j
) % 9 != 0);
// The array is built once, before the timed loop, so construction cost is
// not part of the measurement.
45 std::shared_ptr
<Array
> arr
;
46 ArrayFromVector
<Int64Type
, int64_t>(is_valid
, values
, &arr
);
// Timed region: one DictionaryEncode call per benchmark iteration. Only the
// returned status is inspected (via ABORT_NOT_OK); the encoded result is
// discarded.
48 while (state
.KeepRunning()) {
49 ABORT_NOT_OK(DictionaryEncode(arr
).status());
// Custom counter: null slots as a percentage of the array length. The cast to
// double makes the division a floating-point one.
51 state
.counters
["null_percent"] =
52 static_cast<double>(arr
->null_count()) / arr
->length() * 100;
// Throughput counters, totalled over all iterations and sized by values.size().
53 state
.SetBytesProcessed(state
.iterations() * values
.size() * sizeof(int64_t));
54 state
.SetItemsProcessed(state
.iterations() * values
.size());
// Benchmark: times DictionaryEncode on a String array built from the `data`
// vector of std::string.
//
// The setup loop runs 1024 * 64 times and adds `val.size()` to `total_bytes`
// on each pass; `total_bytes` is later used for the bytes-processed counter.
//
// NOTE(review): neither the definition of `val` nor the statement that appends
// to `data` is visible in this extract (the embedded source numbering skips
// 65-68 inside the loop). Confirm how each string is produced before relying
// on any assumption about its contents.
57 static void BuildStringDictionary(
58 benchmark::State
& state
) { // NOLINT non-const reference
59 const int64_t iterations
= 1024 * 64;
61 std::vector
<std::string
> data
;
63 int64_t total_bytes
= 0;
64 for (int64_t i
= 0; i
< iterations
; i
++) {
69 total_bytes
+= static_cast<int64_t>(val
.size());
// Built once, outside the timed loop. This overload receives no validity
// vector, only the values.
72 std::shared_ptr
<Array
> arr
;
73 ArrayFromVector
<StringType
, std::string
>(data
, &arr
);
// Timed region: one DictionaryEncode call per iteration; only the status is
// checked.
75 while (state
.KeepRunning()) {
76 ABORT_NOT_OK(DictionaryEncode(arr
).status());
// Bytes are counted from the summed string sizes, items from the number of
// strings in `data`.
78 state
.SetBytesProcessed(state
.iterations() * total_bytes
);
79 state
.SetItemsProcessed(state
.iterations() * data
.size());
// Parameter bundle describing one benchmark configuration.
//
// NOTE(review): only `null_probability` is visible in this extract (the
// embedded source numbering skips 83-84). Other code in this file reads
// `params.length` and `params.num_unique` through a HashBenchCase, and the
// case tables initialise each entry with three values, so two further members
// are presumably declared ahead of this one -- confirm against the file.
82 struct HashBenchCase
{
// Compared against 0 by the data generators to decide whether a validity
// vector is produced, and multiplied by 100 for the "null_percent" counter.
85 double null_probability
;
// Generic parameter wrapper used by the benchmarks for types that expose a
// nested `c_type` (aliased to T below). It provides GenerateTestData(), which
// builds the input array, and SetMetadata(), which records counters.
//
// NOTE(review): the line that names this template and the declaration of the
// `params` member are not visible in this extract (embedded numbering jumps
// 88 -> 90 and 90 -> 94). It is presumably the primary `HashParams` template,
// since `HashParams<UInt8Type>` and `HashParams<Int64Type>` are instantiated
// elsewhere in this file -- confirm.
88 template <typename Type
>
90 using T
= typename
Type::c_type
;
// Builds the benchmark input into *arr.
//   arr: output; receives the array produced by ArrayFromVector.
94 void GenerateTestData(std::shared_ptr
<Array
>* arr
) const {
95 std::vector
<int64_t> draws
;
96 std::vector
<T
> values
;
97 std::vector
<bool> is_valid
;
// randint receives params.length, 0 and params.num_unique and fills `draws`;
// presumably params.length integers between 0 and num_unique -- the helper is
// defined outside this file, so confirm its bound semantics.
98 randint
<int64_t>(params
.length
, 0, params
.num_unique
, &draws
);
// Each int64_t draw is converted to the element type T.
99 for (int64_t draw
: draws
) {
100 values
.push_back(static_cast<T
>(draw
));
// A validity vector is only generated when nulls are requested; that path
// uses the ArrayFromVector overload that takes `is_valid`.
102 if (params
.null_probability
> 0) {
103 random_is_valid(params
.length
, params
.null_probability
, &is_valid
);
104 ArrayFromVector
<Type
, T
>(is_valid
, values
, arr
);
// NOTE(review): the `else` that presumably separates this call from the one
// above is not visible in this extract (numbering skips 105) -- confirm.
106 ArrayFromVector
<Type
, T
>(values
, arr
);
// Records the case parameters and throughput totals on `state`.
//   state: benchmark state whose counters are written.
110 void SetMetadata(benchmark::State
& state
) const {
// null_probability is reported as a percentage.
111 state
.counters
["null_percent"] = params
.null_probability
* 100;
112 state
.counters
["num_unique"] = static_cast<double>(params
.num_unique
);
// Bytes: iterations * length * sizeof(T); items: iterations * length.
113 state
.SetBytesProcessed(state
.iterations() * params
.length
* sizeof(T
));
114 state
.SetItemsProcessed(state
.iterations() * params
.length
);
// HashParams variant for StringType: the input strings are slices of one
// random byte buffer, each slice starting at a multiple of `byte_width`.
//
// NOTE(review): the declaration of `byte_width` is not visible in this extract
// (embedded numbering skips 121); it is used as `this->byte_width` and
// `byte_width` below. Callers in this file brace-initialise this type with a
// HashBenchCase followed by 10 or 100.
119 struct HashParams
<StringType
> {
120 HashBenchCase params
;
// Builds the benchmark input into *arr using a StringBuilder.
//   arr: output; receives the array from builder.Finish().
122 void GenerateTestData(std::shared_ptr
<Array
>* arr
) const {
123 std::vector
<int64_t> draws
;
// randint receives params.length, 0 and params.num_unique and fills `draws`;
// each draw is later used as an index into the buffer of unique strings.
124 randint
<int64_t>(params
.length
, 0, params
.num_unique
, &draws
);
// One contiguous buffer holding num_unique candidates of byte_width bytes
// each, filled by random_bytes with a fixed seed of 0.
126 const int64_t total_bytes
= this->byte_width
* params
.num_unique
;
127 std::vector
<uint8_t> uniques(total_bytes
);
128 const uint32_t seed
= 0;
129 random_bytes(total_bytes
, seed
, uniques
.data());
// `is_valid` stays empty unless nulls are requested.
131 std::vector
<bool> is_valid
;
132 if (params
.null_probability
> 0) {
133 random_is_valid(params
.length
, params
.null_probability
, &is_valid
);
136 StringBuilder builder
;
137 for (int64_t i
= 0; i
< params
.length
; ++i
) {
// The `null_probability == 0` test comes first so that `is_valid[i]` is never
// evaluated when the vector was left empty above.
138 if (params
.null_probability
== 0 || is_valid
[i
]) {
// NOTE(review): the remaining argument(s) of this Append call are not visible
// in this extract (numbering skips 140).
// NOTE(review): if randint can return params.num_unique itself (i.e. its upper
// bound is inclusive), the offset byte_width * draws[i] equals total_bytes and
// the slice would start at the end of `uniques` -- confirm randint's range.
139 ABORT_NOT_OK(builder
.Append(uniques
.data() + this->byte_width
* draws
[i
],
// Reached for slots that are not valid; the `else` keyword itself is
// presumably on a line not shown here (numbering skips 141).
142 ABORT_NOT_OK(builder
.AppendNull());
145 ABORT_NOT_OK(builder
.Finish(arr
));
// Records the case parameters and throughput totals on `state`.
//   state: benchmark state whose counters are written.
148 void SetMetadata(benchmark::State
& state
) const {
// null_probability is reported as a percentage.
149 state
.counters
["null_percent"] = params
.null_probability
* 100;
150 state
.counters
["num_unique"] = static_cast<double>(params
.num_unique
);
// Bytes: iterations * length * byte_width, which counts every slot (null
// slots included) as byte_width bytes; items: iterations * length.
151 state
.SetBytesProcessed(state
.iterations() * params
.length
* byte_width
);
152 state
.SetItemsProcessed(state
.iterations() * params
.length
);
// Shared driver for the Unique benchmarks.
//   state:  benchmark state; controls the timed loop and receives counters.
//   params: any type providing GenerateTestData(std::shared_ptr<Array>*) and
//           SetMetadata(benchmark::State&), as the two calls below require.
156 template <typename ParamType
>
157 void BenchUnique(benchmark::State
& state
, const ParamType
& params
) {
// Input is generated once, before the timed loop.
158 std::shared_ptr
<Array
> arr
;
159 params
.GenerateTestData(&arr
);
// Timed region: one Unique call per iteration; only its status is checked.
161 while (state
.KeepRunning()) {
162 ABORT_NOT_OK(Unique(arr
).status());
// Counters are written after the loop, when state.iterations() is final.
164 params
.SetMetadata(state
);
// Shared driver for DictionaryEncode benchmarks; same shape as BenchUnique but
// timing DictionaryEncode instead of Unique.
//   state:  benchmark state; controls the timed loop and receives counters.
//   params: any type providing GenerateTestData(std::shared_ptr<Array>*) and
//           SetMetadata(benchmark::State&), as the two calls below require.
167 template <typename ParamType
>
168 void BenchDictionaryEncode(benchmark::State
& state
, const ParamType
& params
) {
// Input is generated once, before the timed loop.
169 std::shared_ptr
<Array
> arr
;
170 params
.GenerateTestData(&arr
);
// Timed region: one DictionaryEncode call per iteration; only its status is
// checked.
171 while (state
.KeepRunning()) {
172 ABORT_NOT_OK(DictionaryEncode(arr
).status());
// Counters are written after the loop, when state.iterations() is final.
174 params
.SetMetadata(state
);
// First value of every entry in the benchmark case tables in this file:
// 1 << 22 = 4,194,304.
177 constexpr int kHashBenchmarkLength
= 1 << 22;
// Case table indexed by UniqueUInt8 through state.range(0). All seven entries
// share kHashBenchmarkLength and 200 and differ only in the third value, which
// steps from 0 up to 1.
// NOTE(review): the three values presumably map to {length, num_unique,
// null_probability} in declaration order; the first two HashBenchCase members
// are not visible in this extract, so confirm the order. The closing `};` of
// this initializer is also not visible here.
180 std::vector
<HashBenchCase
> uint8_bench_cases
= {
181 {kHashBenchmarkLength
, 200, 0},
182 {kHashBenchmarkLength
, 200, 0.001},
183 {kHashBenchmarkLength
, 200, 0.01},
184 {kHashBenchmarkLength
, 200, 0.1},
185 {kHashBenchmarkLength
, 200, 0.5},
186 {kHashBenchmarkLength
, 200, 0.99},
187 {kHashBenchmarkLength
, 200, 1}
// Benchmark entry point: runs BenchUnique with HashParams<UInt8Type> wrapping
// the uint8_bench_cases entry selected by state.range(0).
// The index is not range-checked here; it relies on the benchmark being
// registered only with arguments that are valid indices into the table.
191 static void UniqueUInt8(benchmark::State
& state
) {
192 BenchUnique(state
, HashParams
<UInt8Type
>{uint8_bench_cases
[state
.range(0)]});
// Case table indexed by UniqueInt64, UniqueString10bytes and
// UniqueString100bytes through state.range(0). Fourteen entries: the same
// seven third-value steps (0 ... 1) repeated for a second value of 100 and of
// 100000.
// NOTE(review): the three values presumably map to {length, num_unique,
// null_probability} in declaration order; the first two HashBenchCase members
// are not visible in this extract, so confirm the order. The closing `};` of
// this initializer is also not visible here.
196 std::vector
<HashBenchCase
> general_bench_cases
= {
197 {kHashBenchmarkLength
, 100, 0},
198 {kHashBenchmarkLength
, 100, 0.001},
199 {kHashBenchmarkLength
, 100, 0.01},
200 {kHashBenchmarkLength
, 100, 0.1},
201 {kHashBenchmarkLength
, 100, 0.5},
202 {kHashBenchmarkLength
, 100, 0.99},
203 {kHashBenchmarkLength
, 100, 1},
204 {kHashBenchmarkLength
, 100000, 0},
205 {kHashBenchmarkLength
, 100000, 0.001},
206 {kHashBenchmarkLength
, 100000, 0.01},
207 {kHashBenchmarkLength
, 100000, 0.1},
208 {kHashBenchmarkLength
, 100000, 0.5},
209 {kHashBenchmarkLength
, 100000, 0.99},
210 {kHashBenchmarkLength
, 100000, 1},
// Benchmark entry point: runs BenchUnique with HashParams<Int64Type> wrapping
// the general_bench_cases entry selected by state.range(0).
// The index is not range-checked here; it relies on the benchmark being
// registered only with arguments that are valid indices into the table.
214 static void UniqueInt64(benchmark::State
& state
) {
215 BenchUnique(state
, HashParams
<Int64Type
>{general_bench_cases
[state
.range(0)]});
// Benchmark entry point: runs BenchUnique with HashParams<StringType> built
// from the general_bench_cases entry selected by state.range(0) and a second
// initializer of 10 (the per-string byte width, per the comment below).
// The table index is not range-checked here.
218 static void UniqueString10bytes(benchmark::State
& state
) {
219 // Byte strings with 10 bytes each
220 BenchUnique(state
, HashParams
<StringType
>{general_bench_cases
[state
.range(0)], 10});
// Benchmark entry point: runs BenchUnique with HashParams<StringType> built
// from the general_bench_cases entry selected by state.range(0) and a second
// initializer of 100 (the per-string byte width, per the comment below).
// The table index is not range-checked here.
223 static void UniqueString100bytes(benchmark::State
& state
) {
224 // Byte strings with 100 bytes each
225 BenchUnique(state
, HashParams
<StringType
>{general_bench_cases
[state
.range(0)], 100});
// Argument-setup callback passed to ->Apply() for the benchmarks that index
// general_bench_cases. It loops over every index of that table; size() is cast
// to int so the comparison with the int loop counter is signed-to-signed.
//   bench: the benchmark being configured.
// NOTE(review): the loop body is not visible in this extract (embedded
// numbering skips 230-233); presumably it registers each index `i` as a
// benchmark argument on `bench` -- confirm against the file.
228 void HashSetArgs(benchmark::internal::Benchmark
* bench
) {
229 for (int i
= 0; i
< static_cast<int>(general_bench_cases
.size()); ++i
) {
// Benchmark registration. The two Build* benchmarks are registered as-is.
234 BENCHMARK(BuildDictionary
);
235 BENCHMARK(BuildStringDictionary
);
// These three index general_bench_cases via state.range(0), so their arguments
// are configured by HashSetArgs, which iterates over that same table.
237 BENCHMARK(UniqueInt64
)->Apply(HashSetArgs
);
238 BENCHMARK(UniqueString10bytes
)->Apply(HashSetArgs
);
239 BENCHMARK(UniqueString100bytes
)->Apply(HashSetArgs
);
// Argument-setup callback passed to ->Apply() for UniqueUInt8. It loops over
// every index of uint8_bench_cases; size() is cast to int so the comparison
// with the int loop counter is signed-to-signed.
//   bench: the benchmark being configured.
// NOTE(review): the loop body is not visible in this extract (embedded
// numbering skips 243-246); presumably it registers each index `i` as a
// benchmark argument on `bench` -- confirm against the file.
241 void UInt8SetArgs(benchmark::internal::Benchmark
* bench
) {
242 for (int i
= 0; i
< static_cast<int>(uint8_bench_cases
.size()); ++i
) {
// UniqueUInt8 indexes uint8_bench_cases via state.range(0), so its arguments
// are configured by UInt8SetArgs, which iterates over that same table.
247 BENCHMARK(UniqueUInt8
)->Apply(UInt8SetArgs
);
249 } // namespace compute