]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "benchmark/benchmark.h" | |
19 | ||
20 | #include <string> | |
21 | #include <type_traits> | |
22 | #include <vector> | |
23 | ||
24 | #include "arrow/testing/gtest_util.h" | |
25 | #include "arrow/util/utf8.h" | |
26 | ||
27 | // Do not benchmark inlined functions directly inside the benchmark loop | |
28 | static ARROW_NOINLINE bool ValidateUTF8NoInline(const uint8_t* data, int64_t size) { | |
29 | return ::arrow::util::ValidateUTF8(data, size); | |
30 | } | |
31 | ||
32 | static ARROW_NOINLINE bool ValidateAsciiNoInline(const uint8_t* data, int64_t size) { | |
33 | return ::arrow::util::ValidateAscii(data, size); | |
34 | } | |
35 | ||
36 | namespace arrow { | |
37 | namespace util { | |
38 | ||
// Sample inputs for the benchmarks below.

// Very short inputs: one pure-ASCII word and one word with an accented letter.
static const char* tiny_valid_ascii = "characters";
static const char* tiny_valid_non_ascii = "caractères";

// A sentence-sized input made only of ASCII characters.
static const char* valid_ascii =
    "UTF-8 is a variable width character encoding capable of encoding all 1,112,064 "
    "valid code points in Unicode using one to four 8-bit bytes";

// A sentence-sized French input: mostly ASCII with a few non-ASCII characters.
static const char* valid_almost_ascii =
    "UTF-8 est un codage de caractères informatiques conçu pour coder l’ensemble des "
    "caractères du « répertoire universel de caractères codés »";

// A sentence-sized Japanese input: predominantly non-ASCII characters.
static const char* valid_non_ascii =
    "UTF-8 はISO/IEC 10646 (UCS) "
    "とUnicodeで使える8ビット符号単位の文字符号化形式及び文字符号化スキーム。 ";
51 | ||
// Build a string of at least `nbytes` bytes by concatenating whole copies of
// `base`.
//
// The result always contains an integral number of repetitions of `base`, so
// its size is `nbytes` rounded up to the next multiple of `base.size()`.
// Returns an empty string when `base` is empty or `nbytes` is not positive
// (there is nothing meaningful to repeat, and this avoids a division by zero).
static std::string MakeLargeString(const std::string& base, int64_t nbytes) {
  std::string s;
  if (base.empty() || nbytes <= 0) {
    return s;
  }
  // Do the arithmetic in signed 64-bit so that it does not silently mix with
  // the unsigned size_type of std::string.
  const auto base_size = static_cast<int64_t>(base.size());
  // Ceiling division, written so that it cannot overflow for large nbytes.
  const int64_t nrepeats = nbytes / base_size + (nbytes % base_size != 0 ? 1 : 0);
  // Reserve exactly the final size (repetitions * size of one repetition),
  // not repetitions * nbytes, which would over-allocate by orders of magnitude.
  s.reserve(static_cast<size_t>(nrepeats * base_size));
  for (int64_t i = 0; i < nrepeats; ++i) {
    s += base;
  }
  return s;
}
61 | ||
62 | static void BenchmarkUTF8Validation( | |
63 | benchmark::State& state, // NOLINT non-const reference | |
64 | const std::string& s, bool expected) { | |
65 | auto data = reinterpret_cast<const uint8_t*>(s.data()); | |
66 | auto data_size = static_cast<int64_t>(s.size()); | |
67 | ||
68 | InitializeUTF8(); | |
69 | bool b = ValidateUTF8NoInline(data, data_size); | |
70 | if (b != expected) { | |
71 | std::cerr << "Unexpected validation result" << std::endl; | |
72 | std::abort(); | |
73 | } | |
74 | ||
75 | while (state.KeepRunning()) { | |
76 | bool b = ValidateUTF8NoInline(data, data_size); | |
77 | benchmark::DoNotOptimize(b); | |
78 | } | |
79 | state.SetBytesProcessed(state.iterations() * s.size()); | |
80 | } | |
81 | ||
82 | static void BenchmarkASCIIValidation( | |
83 | benchmark::State& state, // NOLINT non-const reference | |
84 | const std::string& s, bool expected) { | |
85 | auto data = reinterpret_cast<const uint8_t*>(s.data()); | |
86 | auto data_size = static_cast<int64_t>(s.size()); | |
87 | ||
88 | bool b = ValidateAsciiNoInline(data, data_size); | |
89 | if (b != expected) { | |
90 | std::cerr << "Unexpected validation result" << std::endl; | |
91 | std::abort(); | |
92 | } | |
93 | ||
94 | while (state.KeepRunning()) { | |
95 | bool b = ValidateAsciiNoInline(data, data_size); | |
96 | benchmark::DoNotOptimize(b); | |
97 | } | |
98 | state.SetBytesProcessed(state.iterations() * s.size()); | |
99 | } | |
100 | ||
101 | static void ValidateTinyAscii(benchmark::State& state) { // NOLINT non-const reference | |
102 | BenchmarkASCIIValidation(state, tiny_valid_ascii, true); | |
103 | } | |
104 | ||
105 | static void ValidateTinyNonAscii(benchmark::State& state) { // NOLINT non-const reference | |
106 | BenchmarkUTF8Validation(state, tiny_valid_non_ascii, true); | |
107 | } | |
108 | ||
109 | static void ValidateSmallAscii(benchmark::State& state) { // NOLINT non-const reference | |
110 | BenchmarkASCIIValidation(state, valid_ascii, true); | |
111 | } | |
112 | ||
113 | static void ValidateSmallAlmostAscii( | |
114 | benchmark::State& state) { // NOLINT non-const reference | |
115 | BenchmarkUTF8Validation(state, valid_almost_ascii, true); | |
116 | } | |
117 | ||
118 | static void ValidateSmallNonAscii( | |
119 | benchmark::State& state) { // NOLINT non-const reference | |
120 | BenchmarkUTF8Validation(state, valid_non_ascii, true); | |
121 | } | |
122 | ||
123 | static void ValidateLargeAscii(benchmark::State& state) { // NOLINT non-const reference | |
124 | auto s = MakeLargeString(valid_ascii, 100000); | |
125 | BenchmarkASCIIValidation(state, s, true); | |
126 | } | |
127 | ||
128 | static void ValidateLargeAlmostAscii( | |
129 | benchmark::State& state) { // NOLINT non-const reference | |
130 | auto s = MakeLargeString(valid_almost_ascii, 100000); | |
131 | BenchmarkUTF8Validation(state, s, true); | |
132 | } | |
133 | ||
134 | static void ValidateLargeNonAscii( | |
135 | benchmark::State& state) { // NOLINT non-const reference | |
136 | auto s = MakeLargeString(valid_non_ascii, 100000); | |
137 | BenchmarkUTF8Validation(state, s, true); | |
138 | } | |
139 | ||
140 | BENCHMARK(ValidateTinyAscii); | |
141 | BENCHMARK(ValidateTinyNonAscii); | |
142 | BENCHMARK(ValidateSmallAscii); | |
143 | BENCHMARK(ValidateSmallAlmostAscii); | |
144 | BENCHMARK(ValidateSmallNonAscii); | |
145 | BENCHMARK(ValidateLargeAscii); | |
146 | BENCHMARK(ValidateLargeAlmostAscii); | |
147 | BENCHMARK(ValidateLargeNonAscii); | |
148 | ||
149 | } // namespace util | |
150 | } // namespace arrow |