]>
git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/tdigest_test.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
18 // XXX: There's no rigid error bound available. The accuracy is to some degree
19 // *random*, which depends on input data and quantiles to be calculated. I also
20 // find small gaps among linux/windows/macos.
21 // In below tests, most quantiles are within 1% deviation from exact values,
22 // while the worst test case is about 10% drift.
23 // To make test result stable, I relaxed error bound to be *good enough*.
24 // #define _TDIGEST_STRICT_TEST // enable more strict tests
30 #include <gtest/gtest.h>
32 #include "arrow/testing/gtest_util.h"
33 #include "arrow/testing/random.h"
34 #include "arrow/testing/util.h"
35 #include "arrow/util/tdigest.h"
40 TEST(TDigestTest
, SingleValue
) {
41 const double value
= 0.12345678;
45 ASSERT_OK(td
.Validate());
46 // all quantiles equal to same single vaue
47 for (double q
= 0; q
<= 1; q
+= 0.1) {
48 EXPECT_EQ(td
.Quantile(q
), value
);
52 TEST(TDigestTest
, FewValues
) {
53 // exact quantile at 0.1 interval, test sorted and unsorted input
54 std::vector
<std::vector
<double>> values_vector
= {
55 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
56 {4, 1, 9, 0, 3, 2, 5, 6, 8, 7, 10},
59 for (const auto& values
: values_vector
) {
61 for (double v
: values
) {
64 ASSERT_OK(td
.Validate());
67 for (size_t i
= 0; i
< values
.size(); ++i
) {
68 double expected
= static_cast<double>(i
);
69 EXPECT_EQ(td
.Quantile(q
), expected
);
75 // Calculate exact quantile as truth
76 std::vector
<double> ExactQuantile(std::vector
<double> values
,
77 const std::vector
<double>& quantiles
) {
78 std::sort(values
.begin(), values
.end());
80 std::vector
<double> output
;
81 for (double q
: quantiles
) {
82 const double index
= (values
.size() - 1) * q
;
83 const int64_t lower_index
= static_cast<int64_t>(index
);
84 const double fraction
= index
- lower_index
;
86 output
.push_back(values
[lower_index
]);
89 fraction
* values
[lower_index
+ 1] + (1 - fraction
) * values
[lower_index
];
90 output
.push_back(lerp
);
96 void TestRandom(size_t size
) {
97 const std::vector
<double> fixed_quantiles
= {0, 0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99, 1};
99 // append random quantiles to test
100 std::vector
<double> quantiles
;
101 random_real(50, 0x11223344, 0.0, 1.0, &quantiles
);
102 quantiles
.insert(quantiles
.end(), fixed_quantiles
.cbegin(), fixed_quantiles
.cend());
104 // generate random test values
105 const double min
= 1e3
, max
= 1e10
;
106 std::vector
<double> values
;
107 random_real(size
, 0x11223344, min
, max
, &values
);
110 for (double value
: values
) {
113 ASSERT_OK(td
.Validate());
115 const std::vector
<double> expected
= ExactQuantile(values
, quantiles
);
116 std::vector
<double> approximated
;
117 for (auto q
: quantiles
) {
118 approximated
.push_back(td
.Quantile(q
));
121 // r-square of expected and approximated quantiles should be greater than 0.999
122 const double expected_mean
=
123 std::accumulate(expected
.begin(), expected
.end(), 0.0) / expected
.size();
124 double rss
= 0, tss
= 0;
125 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
126 rss
+= (expected
[i
] - approximated
[i
]) * (expected
[i
] - approximated
[i
]);
127 tss
+= (expected
[i
] - expected_mean
) * (expected
[i
] - expected_mean
);
129 const double r2
= 1 - rss
/ tss
;
130 EXPECT_GT(r2
, 0.999);
132 // make sure no quantile drifts too much from the truth
133 #ifdef _TDIGEST_STRICT_TEST
134 const double error_ratio
= 0.02;
136 const double error_ratio
= 0.05;
138 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
139 const double tolerance
= std::fabs(expected
[i
]) * error_ratio
;
140 EXPECT_NEAR(approximated
[i
], expected
[i
], tolerance
) << quantiles
[i
];
144 TEST(TDigestTest
, RandomValues
) { TestRandom(100000); }
146 // too heavy to run in ci
147 TEST(TDigestTest
, DISABLED_HugeVolume
) { TestRandom(1U << 30); }
149 void TestMerge(const std::vector
<std::vector
<double>>& values_vector
, uint32_t delta
,
150 double error_ratio
) {
151 const std::vector
<double> quantiles
= {0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
152 0.6, 0.7, 0.8, 0.9, 0.99, 1};
154 std::vector
<TDigest
> tds
;
155 for (const auto& values
: values_vector
) {
157 for (double value
: values
) {
160 ASSERT_OK(td
.Validate());
161 tds
.push_back(std::move(td
));
164 std::vector
<double> values_combined
;
165 for (const auto& values
: values_vector
) {
166 values_combined
.insert(values_combined
.end(), values
.begin(), values
.end());
168 const std::vector
<double> expected
= ExactQuantile(values_combined
, quantiles
);
170 // merge into an empty tdigest
174 ASSERT_OK(td
.Validate());
175 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
176 const double tolerance
= std::max(std::fabs(expected
[i
]) * error_ratio
, 0.1);
177 EXPECT_NEAR(td
.Quantile(quantiles
[i
]), expected
[i
], tolerance
) << quantiles
[i
];
181 // merge into a non empty tdigest
183 TDigest td
= std::move(tds
[0]);
184 tds
.erase(tds
.begin(), tds
.begin() + 1);
186 ASSERT_OK(td
.Validate());
187 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
188 const double tolerance
= std::max(std::fabs(expected
[i
]) * error_ratio
, 0.1);
189 EXPECT_NEAR(td
.Quantile(quantiles
[i
]), expected
[i
], tolerance
) << quantiles
[i
];
194 // merge tdigests with same distribution
195 TEST(TDigestTest
, MergeUniform
) {
196 const std::vector
<size_t> sizes
= {20000, 3000, 1500, 18000, 9999, 6666};
197 std::vector
<std::vector
<double>> values_vector
;
198 for (auto size
: sizes
) {
199 std::vector
<double> values
;
200 random_real(size
, 0x11223344, -123456789.0, 987654321.0, &values
);
201 values_vector
.push_back(std::move(values
));
204 #ifdef _TDIGEST_STRICT_TEST
205 TestMerge(values_vector
, /*delta=*/100, /*error_ratio=*/0.01);
207 TestMerge(values_vector
, /*delta=*/200, /*error_ratio=*/0.05);
211 // merge tdigests with different distributions
212 TEST(TDigestTest
, MergeNonUniform
) {
218 {2000, 1e8
, 1e9
}, {0, 0, 0}, {3000, -1, 1}, {500, -1e6
, -1e5
}, {800, 100, 100},
220 std::vector
<std::vector
<double>> values_vector
;
221 for (const auto& cfg
: configs
) {
222 std::vector
<double> values
;
223 random_real(cfg
.size
, 0x11223344, cfg
.min
, cfg
.max
, &values
);
224 values_vector
.push_back(std::move(values
));
227 #ifdef _TDIGEST_STRICT_TEST
228 TestMerge(values_vector
, /*delta=*/200, /*error_ratio=*/0.01);
230 TestMerge(values_vector
, /*delta=*/200, /*error_ratio=*/0.05);
234 TEST(TDigestTest
, Misc
) {
235 const size_t size
= 100000;
236 const double min
= -1000, max
= 1000;
237 const std::vector
<double> quantiles
= {0, 0.01, 0.1, 0.4, 0.7, 0.9, 0.99, 1};
239 std::vector
<double> values
;
240 random_real(size
, 0x11223344, min
, max
, &values
);
241 const std::vector
<double> expected
= ExactQuantile(values
, quantiles
);
243 // test small delta and buffer
245 #ifdef _TDIGEST_STRICT_TEST
246 const double error_ratio
= 0.06; // low accuracy for small delta
248 const double error_ratio
= 0.15;
252 for (double value
: values
) {
255 ASSERT_OK(td
.Validate());
257 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
258 const double tolerance
= std::max(std::fabs(expected
[i
]) * error_ratio
, 0.1);
259 EXPECT_NEAR(td
.Quantile(quantiles
[i
]), expected
[i
], tolerance
) << quantiles
[i
];
263 // test many duplicated values
265 #ifdef _TDIGEST_STRICT_TEST
266 const double error_ratio
= 0.02;
268 const double error_ratio
= 0.05;
271 auto values_integer
= values
;
272 for (double& value
: values_integer
) {
273 value
= std::ceil(value
);
277 for (double value
: values_integer
) {
280 ASSERT_OK(td
.Validate());
282 for (size_t i
= 0; i
< quantiles
.size(); ++i
) {
283 const double tolerance
= std::max(std::fabs(expected
[i
]) * error_ratio
, 0.1);
284 EXPECT_NEAR(td
.Quantile(quantiles
[i
]), expected
[i
], tolerance
) << quantiles
[i
];
289 } // namespace internal