]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/tdigest_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / tdigest_test.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // XXX: There's no rigid error bound available. The accuracy is to some degree
19 // *random*, which depends on input data and quantiles to be calculated. I also
20 // find small gaps among linux/windows/macos.
21 // In below tests, most quantiles are within 1% deviation from exact values,
22 // while the worst test case is about 10% drift.
23 // To make test result stable, I relaxed error bound to be *good enough*.
24 // #define _TDIGEST_STRICT_TEST // enable more strict tests
25
26 #include <algorithm>
27 #include <cmath>
28 #include <vector>
29
30 #include <gtest/gtest.h>
31
32 #include "arrow/testing/gtest_util.h"
33 #include "arrow/testing/random.h"
34 #include "arrow/testing/util.h"
35 #include "arrow/util/tdigest.h"
36
37 namespace arrow {
38 namespace internal {
39
40 TEST(TDigestTest, SingleValue) {
41 const double value = 0.12345678;
42
43 TDigest td;
44 td.Add(value);
45 ASSERT_OK(td.Validate());
46 // all quantiles equal to same single vaue
47 for (double q = 0; q <= 1; q += 0.1) {
48 EXPECT_EQ(td.Quantile(q), value);
49 }
50 }
51
52 TEST(TDigestTest, FewValues) {
53 // exact quantile at 0.1 interval, test sorted and unsorted input
54 std::vector<std::vector<double>> values_vector = {
55 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
56 {4, 1, 9, 0, 3, 2, 5, 6, 8, 7, 10},
57 };
58
59 for (const auto& values : values_vector) {
60 TDigest td;
61 for (double v : values) {
62 td.Add(v);
63 }
64 ASSERT_OK(td.Validate());
65
66 double q = 0;
67 for (size_t i = 0; i < values.size(); ++i) {
68 double expected = static_cast<double>(i);
69 EXPECT_EQ(td.Quantile(q), expected);
70 q += 0.1;
71 }
72 }
73 }
74
75 // Calculate exact quantile as truth
76 std::vector<double> ExactQuantile(std::vector<double> values,
77 const std::vector<double>& quantiles) {
78 std::sort(values.begin(), values.end());
79
80 std::vector<double> output;
81 for (double q : quantiles) {
82 const double index = (values.size() - 1) * q;
83 const int64_t lower_index = static_cast<int64_t>(index);
84 const double fraction = index - lower_index;
85 if (fraction == 0) {
86 output.push_back(values[lower_index]);
87 } else {
88 const double lerp =
89 fraction * values[lower_index + 1] + (1 - fraction) * values[lower_index];
90 output.push_back(lerp);
91 }
92 }
93 return output;
94 }
95
96 void TestRandom(size_t size) {
97 const std::vector<double> fixed_quantiles = {0, 0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99, 1};
98
99 // append random quantiles to test
100 std::vector<double> quantiles;
101 random_real(50, 0x11223344, 0.0, 1.0, &quantiles);
102 quantiles.insert(quantiles.end(), fixed_quantiles.cbegin(), fixed_quantiles.cend());
103
104 // generate random test values
105 const double min = 1e3, max = 1e10;
106 std::vector<double> values;
107 random_real(size, 0x11223344, min, max, &values);
108
109 TDigest td(200);
110 for (double value : values) {
111 td.Add(value);
112 }
113 ASSERT_OK(td.Validate());
114
115 const std::vector<double> expected = ExactQuantile(values, quantiles);
116 std::vector<double> approximated;
117 for (auto q : quantiles) {
118 approximated.push_back(td.Quantile(q));
119 }
120
121 // r-square of expected and approximated quantiles should be greater than 0.999
122 const double expected_mean =
123 std::accumulate(expected.begin(), expected.end(), 0.0) / expected.size();
124 double rss = 0, tss = 0;
125 for (size_t i = 0; i < quantiles.size(); ++i) {
126 rss += (expected[i] - approximated[i]) * (expected[i] - approximated[i]);
127 tss += (expected[i] - expected_mean) * (expected[i] - expected_mean);
128 }
129 const double r2 = 1 - rss / tss;
130 EXPECT_GT(r2, 0.999);
131
132 // make sure no quantile drifts too much from the truth
133 #ifdef _TDIGEST_STRICT_TEST
134 const double error_ratio = 0.02;
135 #else
136 const double error_ratio = 0.05;
137 #endif
138 for (size_t i = 0; i < quantiles.size(); ++i) {
139 const double tolerance = std::fabs(expected[i]) * error_ratio;
140 EXPECT_NEAR(approximated[i], expected[i], tolerance) << quantiles[i];
141 }
142 }
143
144 TEST(TDigestTest, RandomValues) { TestRandom(100000); }
145
146 // too heavy to run in ci
147 TEST(TDigestTest, DISABLED_HugeVolume) { TestRandom(1U << 30); }
148
149 void TestMerge(const std::vector<std::vector<double>>& values_vector, uint32_t delta,
150 double error_ratio) {
151 const std::vector<double> quantiles = {0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
152 0.6, 0.7, 0.8, 0.9, 0.99, 1};
153
154 std::vector<TDigest> tds;
155 for (const auto& values : values_vector) {
156 TDigest td(delta);
157 for (double value : values) {
158 td.Add(value);
159 }
160 ASSERT_OK(td.Validate());
161 tds.push_back(std::move(td));
162 }
163
164 std::vector<double> values_combined;
165 for (const auto& values : values_vector) {
166 values_combined.insert(values_combined.end(), values.begin(), values.end());
167 }
168 const std::vector<double> expected = ExactQuantile(values_combined, quantiles);
169
170 // merge into an empty tdigest
171 {
172 TDigest td(delta);
173 td.Merge(tds);
174 ASSERT_OK(td.Validate());
175 for (size_t i = 0; i < quantiles.size(); ++i) {
176 const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
177 EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
178 }
179 }
180
181 // merge into a non empty tdigest
182 {
183 TDigest td = std::move(tds[0]);
184 tds.erase(tds.begin(), tds.begin() + 1);
185 td.Merge(tds);
186 ASSERT_OK(td.Validate());
187 for (size_t i = 0; i < quantiles.size(); ++i) {
188 const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
189 EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
190 }
191 }
192 }
193
194 // merge tdigests with same distribution
195 TEST(TDigestTest, MergeUniform) {
196 const std::vector<size_t> sizes = {20000, 3000, 1500, 18000, 9999, 6666};
197 std::vector<std::vector<double>> values_vector;
198 for (auto size : sizes) {
199 std::vector<double> values;
200 random_real(size, 0x11223344, -123456789.0, 987654321.0, &values);
201 values_vector.push_back(std::move(values));
202 }
203
204 #ifdef _TDIGEST_STRICT_TEST
205 TestMerge(values_vector, /*delta=*/100, /*error_ratio=*/0.01);
206 #else
207 TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.05);
208 #endif
209 }
210
211 // merge tdigests with different distributions
212 TEST(TDigestTest, MergeNonUniform) {
213 struct {
214 size_t size;
215 double min;
216 double max;
217 } configs[] = {
218 {2000, 1e8, 1e9}, {0, 0, 0}, {3000, -1, 1}, {500, -1e6, -1e5}, {800, 100, 100},
219 };
220 std::vector<std::vector<double>> values_vector;
221 for (const auto& cfg : configs) {
222 std::vector<double> values;
223 random_real(cfg.size, 0x11223344, cfg.min, cfg.max, &values);
224 values_vector.push_back(std::move(values));
225 }
226
227 #ifdef _TDIGEST_STRICT_TEST
228 TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.01);
229 #else
230 TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.05);
231 #endif
232 }
233
234 TEST(TDigestTest, Misc) {
235 const size_t size = 100000;
236 const double min = -1000, max = 1000;
237 const std::vector<double> quantiles = {0, 0.01, 0.1, 0.4, 0.7, 0.9, 0.99, 1};
238
239 std::vector<double> values;
240 random_real(size, 0x11223344, min, max, &values);
241 const std::vector<double> expected = ExactQuantile(values, quantiles);
242
243 // test small delta and buffer
244 {
245 #ifdef _TDIGEST_STRICT_TEST
246 const double error_ratio = 0.06; // low accuracy for small delta
247 #else
248 const double error_ratio = 0.15;
249 #endif
250
251 TDigest td(10, 50);
252 for (double value : values) {
253 td.Add(value);
254 }
255 ASSERT_OK(td.Validate());
256
257 for (size_t i = 0; i < quantiles.size(); ++i) {
258 const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
259 EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
260 }
261 }
262
263 // test many duplicated values
264 {
265 #ifdef _TDIGEST_STRICT_TEST
266 const double error_ratio = 0.02;
267 #else
268 const double error_ratio = 0.05;
269 #endif
270
271 auto values_integer = values;
272 for (double& value : values_integer) {
273 value = std::ceil(value);
274 }
275
276 TDigest td(100);
277 for (double value : values_integer) {
278 td.Add(value);
279 }
280 ASSERT_OK(td.Validate());
281
282 for (size_t i = 0; i < quantiles.size(); ++i) {
283 const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
284 EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
285 }
286 }
287 }
288
289 } // namespace internal
290 } // namespace arrow