// Origin: ceph.git (git.proxmox.com), ceph/src/arrow/cpp/src/arrow/util/utf8_util_benchmark.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
#include "benchmark/benchmark.h"

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <type_traits>

#include "arrow/testing/gtest_util.h"
#include "arrow/util/utf8.h"
27 // Do not benchmark inlined functions directly inside the benchmark loop
28 static ARROW_NOINLINE
bool ValidateUTF8NoInline(const uint8_t* data
, int64_t size
) {
29 return ::arrow::util::ValidateUTF8(data
, size
);
32 static ARROW_NOINLINE
bool ValidateAsciiNoInline(const uint8_t* data
, int64_t size
) {
33 return ::arrow::util::ValidateAscii(data
, size
);
// Single-word inputs: one pure ASCII, one containing an accented character.
static const char* tiny_valid_ascii = "characters";
static const char* tiny_valid_non_ascii = "caractères";

// Sentence-sized input made only of ASCII characters.
static const char* valid_ascii =
    "UTF-8 is a variable width character encoding capable of encoding all 1,112,064 "
    "valid code points in Unicode using one to four 8-bit bytes";
// Sentence-sized French input: mostly ASCII with a few non-ASCII characters.
static const char* valid_almost_ascii =
    "UTF-8 est un codage de caractères informatiques conçu pour coder l’ensemble des "
    "caractères du « répertoire universel de caractères codés »";
// Sentence-sized Japanese input: predominantly non-ASCII characters.
static const char* valid_non_ascii =
    "UTF-8 はISO/IEC 10646 (UCS) "
    "とUnicodeで使える8ビット符号単位の文字符号化形式及び文字符号化スキーム。 ";
// Builds a string of at least `nbytes` bytes by concatenating whole copies of
// `base`. Because only whole copies are appended, the result may be up to
// `base.size() - 1` bytes longer than `nbytes`.
//
// Returns an empty string when `base` is empty or `nbytes` is not positive.
static std::string MakeLargeString(const std::string& base, int64_t nbytes) {
  std::string s;
  if (base.empty() || nbytes <= 0) {
    // Nothing to repeat; this also avoids dividing by a zero length below.
    return s;
  }
  const auto base_size = static_cast<int64_t>(base.size());
  // Round up so the result is never shorter than `nbytes`.
  const int64_t nrepeats = (nbytes + base_size - 1) / base_size;
  // Reserve the final length: repeat count times the length of `base`
  // (not times `nbytes`, which would over-allocate by orders of magnitude).
  s.reserve(static_cast<std::string::size_type>(nrepeats * base_size));
  for (int64_t i = 0; i < nrepeats; ++i) {
    s += base;
  }
  return s;
}
62 static void BenchmarkUTF8Validation(
63 benchmark::State
& state
, // NOLINT non-const reference
64 const std::string
& s
, bool expected
) {
65 auto data
= reinterpret_cast<const uint8_t*>(s
.data());
66 auto data_size
= static_cast<int64_t>(s
.size());
69 bool b
= ValidateUTF8NoInline(data
, data_size
);
71 std::cerr
<< "Unexpected validation result" << std::endl
;
75 while (state
.KeepRunning()) {
76 bool b
= ValidateUTF8NoInline(data
, data_size
);
77 benchmark::DoNotOptimize(b
);
79 state
.SetBytesProcessed(state
.iterations() * s
.size());
82 static void BenchmarkASCIIValidation(
83 benchmark::State
& state
, // NOLINT non-const reference
84 const std::string
& s
, bool expected
) {
85 auto data
= reinterpret_cast<const uint8_t*>(s
.data());
86 auto data_size
= static_cast<int64_t>(s
.size());
88 bool b
= ValidateAsciiNoInline(data
, data_size
);
90 std::cerr
<< "Unexpected validation result" << std::endl
;
94 while (state
.KeepRunning()) {
95 bool b
= ValidateAsciiNoInline(data
, data_size
);
96 benchmark::DoNotOptimize(b
);
98 state
.SetBytesProcessed(state
.iterations() * s
.size());
101 static void ValidateTinyAscii(benchmark::State
& state
) { // NOLINT non-const reference
102 BenchmarkASCIIValidation(state
, tiny_valid_ascii
, true);
105 static void ValidateTinyNonAscii(benchmark::State
& state
) { // NOLINT non-const reference
106 BenchmarkUTF8Validation(state
, tiny_valid_non_ascii
, true);
109 static void ValidateSmallAscii(benchmark::State
& state
) { // NOLINT non-const reference
110 BenchmarkASCIIValidation(state
, valid_ascii
, true);
113 static void ValidateSmallAlmostAscii(
114 benchmark::State
& state
) { // NOLINT non-const reference
115 BenchmarkUTF8Validation(state
, valid_almost_ascii
, true);
118 static void ValidateSmallNonAscii(
119 benchmark::State
& state
) { // NOLINT non-const reference
120 BenchmarkUTF8Validation(state
, valid_non_ascii
, true);
123 static void ValidateLargeAscii(benchmark::State
& state
) { // NOLINT non-const reference
124 auto s
= MakeLargeString(valid_ascii
, 100000);
125 BenchmarkASCIIValidation(state
, s
, true);
128 static void ValidateLargeAlmostAscii(
129 benchmark::State
& state
) { // NOLINT non-const reference
130 auto s
= MakeLargeString(valid_almost_ascii
, 100000);
131 BenchmarkUTF8Validation(state
, s
, true);
134 static void ValidateLargeNonAscii(
135 benchmark::State
& state
) { // NOLINT non-const reference
136 auto s
= MakeLargeString(valid_non_ascii
, 100000);
137 BenchmarkUTF8Validation(state
, s
, true);
140 BENCHMARK(ValidateTinyAscii
);
141 BENCHMARK(ValidateTinyNonAscii
);
142 BENCHMARK(ValidateSmallAscii
);
143 BENCHMARK(ValidateSmallAlmostAscii
);
144 BENCHMARK(ValidateSmallNonAscii
);
145 BENCHMARK(ValidateLargeAscii
);
146 BENCHMARK(ValidateLargeAlmostAscii
);
147 BENCHMARK(ValidateLargeNonAscii
);