git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/utf8_util_benchmark.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / utf8_util_benchmark.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "benchmark/benchmark.h"
19
20 #include <string>
21 #include <type_traits>
22 #include <vector>
23
24 #include "arrow/testing/gtest_util.h"
25 #include "arrow/util/utf8.h"
26
27 // Do not benchmark inlined functions directly inside the benchmark loop
28 static ARROW_NOINLINE bool ValidateUTF8NoInline(const uint8_t* data, int64_t size) {
29 return ::arrow::util::ValidateUTF8(data, size);
30 }
31
32 static ARROW_NOINLINE bool ValidateAsciiNoInline(const uint8_t* data, int64_t size) {
33 return ::arrow::util::ValidateAscii(data, size);
34 }
35
36 namespace arrow {
37 namespace util {
38
// Sample inputs fed to the validation benchmarks below.

// A single word: once in pure ASCII, once with one accented character.
static const char* tiny_valid_ascii{"characters"};
static const char* tiny_valid_non_ascii{"caractères"};

// One English sentence containing only ASCII characters.
static const char* valid_ascii{
    "UTF-8 is a variable width character encoding "
    "capable of encoding all 1,112,064 valid code points "
    "in Unicode using one to four 8-bit bytes"};

// One French sentence: mostly ASCII with a few accented letters and
// typographic punctuation marks.
static const char* valid_almost_ascii{
    "UTF-8 est un codage de caractères informatiques "
    "conçu pour coder l’ensemble des caractères du "
    "« répertoire universel de caractères codés »"};

// One Japanese sentence: mostly non-ASCII characters.
static const char* valid_non_ascii{
    "UTF-8 はISO/IEC 10646 (UCS) とUnicodeで使える"
    "8ビット符号単位の文字符号化形式及び"
    "文字符号化スキーム。 "};
51
// Build a large string by concatenating whole copies of `base`.
//
// base:   the pattern to repeat
// nbytes: the minimum number of bytes wanted in the result
//
// Returns `base` repeated ceil(nbytes / base.size()) times, so the result is
// at least `nbytes` bytes long and always ends on a complete copy of `base`.
// Returns an empty string when `base` is empty or `nbytes` is not positive.
static std::string MakeLargeString(const std::string& base, int64_t nbytes) {
  // Do all arithmetic in one signed type; mixing int64_t with size_t would
  // silently turn a negative `nbytes` into a huge unsigned value.
  const auto base_size = static_cast<int64_t>(base.size());

  // An empty pattern can never reach the requested size (and would divide by
  // zero below); a non-positive request needs no repetition at all.
  if (base_size == 0 || nbytes <= 0) {
    return std::string();
  }

  // Ceiling division: the smallest repeat count whose total is >= nbytes.
  const int64_t nrepeats = (nbytes + base_size - 1) / base_size;

  std::string s;
  // Reserve exactly the final length (repeat count times pattern length).
  s.reserve(static_cast<std::string::size_type>(nrepeats * base_size));
  for (int64_t i = 0; i < nrepeats; ++i) {
    s += base;
  }
  return s;
}
61
62 static void BenchmarkUTF8Validation(
63 benchmark::State& state, // NOLINT non-const reference
64 const std::string& s, bool expected) {
65 auto data = reinterpret_cast<const uint8_t*>(s.data());
66 auto data_size = static_cast<int64_t>(s.size());
67
68 InitializeUTF8();
69 bool b = ValidateUTF8NoInline(data, data_size);
70 if (b != expected) {
71 std::cerr << "Unexpected validation result" << std::endl;
72 std::abort();
73 }
74
75 while (state.KeepRunning()) {
76 bool b = ValidateUTF8NoInline(data, data_size);
77 benchmark::DoNotOptimize(b);
78 }
79 state.SetBytesProcessed(state.iterations() * s.size());
80 }
81
82 static void BenchmarkASCIIValidation(
83 benchmark::State& state, // NOLINT non-const reference
84 const std::string& s, bool expected) {
85 auto data = reinterpret_cast<const uint8_t*>(s.data());
86 auto data_size = static_cast<int64_t>(s.size());
87
88 bool b = ValidateAsciiNoInline(data, data_size);
89 if (b != expected) {
90 std::cerr << "Unexpected validation result" << std::endl;
91 std::abort();
92 }
93
94 while (state.KeepRunning()) {
95 bool b = ValidateAsciiNoInline(data, data_size);
96 benchmark::DoNotOptimize(b);
97 }
98 state.SetBytesProcessed(state.iterations() * s.size());
99 }
100
101 static void ValidateTinyAscii(benchmark::State& state) { // NOLINT non-const reference
102 BenchmarkASCIIValidation(state, tiny_valid_ascii, true);
103 }
104
105 static void ValidateTinyNonAscii(benchmark::State& state) { // NOLINT non-const reference
106 BenchmarkUTF8Validation(state, tiny_valid_non_ascii, true);
107 }
108
109 static void ValidateSmallAscii(benchmark::State& state) { // NOLINT non-const reference
110 BenchmarkASCIIValidation(state, valid_ascii, true);
111 }
112
113 static void ValidateSmallAlmostAscii(
114 benchmark::State& state) { // NOLINT non-const reference
115 BenchmarkUTF8Validation(state, valid_almost_ascii, true);
116 }
117
118 static void ValidateSmallNonAscii(
119 benchmark::State& state) { // NOLINT non-const reference
120 BenchmarkUTF8Validation(state, valid_non_ascii, true);
121 }
122
123 static void ValidateLargeAscii(benchmark::State& state) { // NOLINT non-const reference
124 auto s = MakeLargeString(valid_ascii, 100000);
125 BenchmarkASCIIValidation(state, s, true);
126 }
127
128 static void ValidateLargeAlmostAscii(
129 benchmark::State& state) { // NOLINT non-const reference
130 auto s = MakeLargeString(valid_almost_ascii, 100000);
131 BenchmarkUTF8Validation(state, s, true);
132 }
133
134 static void ValidateLargeNonAscii(
135 benchmark::State& state) { // NOLINT non-const reference
136 auto s = MakeLargeString(valid_non_ascii, 100000);
137 BenchmarkUTF8Validation(state, s, true);
138 }
139
// Hand each benchmark entry point defined above to Google Benchmark's
// BENCHMARK macro, listed from the tiny inputs through the small ones to the
// large ones.
140 BENCHMARK(ValidateTinyAscii);
141 BENCHMARK(ValidateTinyNonAscii);
142 BENCHMARK(ValidateSmallAscii);
143 BENCHMARK(ValidateSmallAlmostAscii);
144 BENCHMARK(ValidateSmallNonAscii);
145 BENCHMARK(ValidateLargeAscii);
146 BENCHMARK(ValidateLargeAlmostAscii);
147 BENCHMARK(ValidateLargeNonAscii);
148
149 } // namespace util
150 } // namespace arrow