ceph/src/arrow/cpp/src/arrow/util/tdigest_test.cc

   1 // Licensed to the Apache Software Foundation (ASF) under one
   2 // or more contributor license agreements.  See the NOTICE file
   3 // distributed with this work for additional information
   4 // regarding copyright ownership.  The ASF licenses this file
   5 // to you under the Apache License, Version 2.0 (the
   6 // "License"); you may not use this file except in compliance
   7 // with the License.  You may obtain a copy of the License at
   8 //
   9 //   http://www.apache.org/licenses/LICENSE-2.0
  10 //
  11 // Unless required by applicable law or agreed to in writing,
  12 // software distributed under the License is distributed on an
  13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 // KIND, either express or implied.  See the License for the
  15 // specific language governing permissions and limitations
  16 // under the License.
  17
  18 // XXX: There's no rigid error bound available. The accuracy is to some degree
  19 // *random*, which depends on input data and quantiles to be calculated. I also
  20 // find small gaps among linux/windows/macos.
  21 // In below tests, most quantiles are within 1% deviation from exact values,
  22 // while the worst test case is about 10% drift.
  23 // To make test result stable, I relaxed error bound to be *good enough*.
  24 // #define _TDIGEST_STRICT_TEST   // enable more strict tests
  25
  26 #include <algorithm>
  27 #include <cmath>
  28 #include <vector>
  29
  30 #include <gtest/gtest.h>
  31
  32 #include "arrow/testing/gtest_util.h"
  33 #include "arrow/testing/random.h"
  34 #include "arrow/testing/util.h"
  35 #include "arrow/util/tdigest.h"
  36
  37 namespace arrow {
  38 namespace internal {
  39
  40 TEST(TDigestTest, SingleValue) {
  41   const double value = 0.12345678;
  42
  43   TDigest td;
  44   td.Add(value);
  45   ASSERT_OK(td.Validate());
  46   // all quantiles equal to same single vaue
  47   for (double q = 0; q <= 1; q += 0.1) {
  48     EXPECT_EQ(td.Quantile(q), value);
  49   }
  50 }
  51
  52 TEST(TDigestTest, FewValues) {
  53   // exact quantile at 0.1 interval, test sorted and unsorted input
  54   std::vector<std::vector<double>> values_vector = {
  55       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
  56       {4, 1, 9, 0, 3, 2, 5, 6, 8, 7, 10},
  57   };
  58
  59   for (const auto& values : values_vector) {
  60     TDigest td;
  61     for (double v : values) {
  62       td.Add(v);
  63     }
  64     ASSERT_OK(td.Validate());
  65
  66     double q = 0;
  67     for (size_t i = 0; i < values.size(); ++i) {
  68       double expected = static_cast<double>(i);
  69       EXPECT_EQ(td.Quantile(q), expected);
  70       q += 0.1;
  71     }
  72   }
  73 }
  74
  75 // Calculate exact quantile as truth
  76 std::vector<double> ExactQuantile(std::vector<double> values,
  77                                   const std::vector<double>& quantiles) {
  78   std::sort(values.begin(), values.end());
  79
  80   std::vector<double> output;
  81   for (double q : quantiles) {
  82     const double index = (values.size() - 1) * q;
  83     const int64_t lower_index = static_cast<int64_t>(index);
  84     const double fraction = index - lower_index;
  85     if (fraction == 0) {
  86       output.push_back(values[lower_index]);
  87     } else {
  88       const double lerp =
  89           fraction * values[lower_index + 1] + (1 - fraction) * values[lower_index];
  90       output.push_back(lerp);
  91     }
  92   }
  93   return output;
  94 }
  95
  96 void TestRandom(size_t size) {
  97   const std::vector<double> fixed_quantiles = {0, 0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99, 1};
  98
  99   // append random quantiles to test
 100   std::vector<double> quantiles;
 101   random_real(50, 0x11223344, 0.0, 1.0, &quantiles);
 102   quantiles.insert(quantiles.end(), fixed_quantiles.cbegin(), fixed_quantiles.cend());
 103
 104   // generate random test values
 105   const double min = 1e3, max = 1e10;
 106   std::vector<double> values;
 107   random_real(size, 0x11223344, min, max, &values);
 108
 109   TDigest td(200);
 110   for (double value : values) {
 111     td.Add(value);
 112   }
 113   ASSERT_OK(td.Validate());
 114
 115   const std::vector<double> expected = ExactQuantile(values, quantiles);
 116   std::vector<double> approximated;
 117   for (auto q : quantiles) {
 118     approximated.push_back(td.Quantile(q));
 119   }
 120
 121   // r-square of expected and approximated quantiles should be greater than 0.999
 122   const double expected_mean =
 123       std::accumulate(expected.begin(), expected.end(), 0.0) / expected.size();
 124   double rss = 0, tss = 0;
 125   for (size_t i = 0; i < quantiles.size(); ++i) {
 126     rss += (expected[i] - approximated[i]) * (expected[i] - approximated[i]);
 127     tss += (expected[i] - expected_mean) * (expected[i] - expected_mean);
 128   }
 129   const double r2 = 1 - rss / tss;
 130   EXPECT_GT(r2, 0.999);
 131
 132   // make sure no quantile drifts too much from the truth
 133 #ifdef _TDIGEST_STRICT_TEST
 134   const double error_ratio = 0.02;
 135 #else
 136   const double error_ratio = 0.05;
 137 #endif
 138   for (size_t i = 0; i < quantiles.size(); ++i) {
 139     const double tolerance = std::fabs(expected[i]) * error_ratio;
 140     EXPECT_NEAR(approximated[i], expected[i], tolerance) << quantiles[i];
 141   }
 142 }
 143
 144 TEST(TDigestTest, RandomValues) { TestRandom(100000); }
 145
 146 // too heavy to run in ci
 147 TEST(TDigestTest, DISABLED_HugeVolume) { TestRandom(1U << 30); }
 148
 149 void TestMerge(const std::vector<std::vector<double>>& values_vector, uint32_t delta,
 150                double error_ratio) {
 151   const std::vector<double> quantiles = {0,   0.01, 0.1, 0.2, 0.3,  0.4, 0.5,
 152                                          0.6, 0.7,  0.8, 0.9, 0.99, 1};
 153
 154   std::vector<TDigest> tds;
 155   for (const auto& values : values_vector) {
 156     TDigest td(delta);
 157     for (double value : values) {
 158       td.Add(value);
 159     }
 160     ASSERT_OK(td.Validate());
 161     tds.push_back(std::move(td));
 162   }
 163
 164   std::vector<double> values_combined;
 165   for (const auto& values : values_vector) {
 166     values_combined.insert(values_combined.end(), values.begin(), values.end());
 167   }
 168   const std::vector<double> expected = ExactQuantile(values_combined, quantiles);
 169
 170   // merge into an empty tdigest
 171   {
 172     TDigest td(delta);
 173     td.Merge(tds);
 174     ASSERT_OK(td.Validate());
 175     for (size_t i = 0; i < quantiles.size(); ++i) {
 176       const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
 177       EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
 178     }
 179   }
 180
 181   // merge into a non empty tdigest
 182   {
 183     TDigest td = std::move(tds[0]);
 184     tds.erase(tds.begin(), tds.begin() + 1);
 185     td.Merge(tds);
 186     ASSERT_OK(td.Validate());
 187     for (size_t i = 0; i < quantiles.size(); ++i) {
 188       const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
 189       EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
 190     }
 191   }
 192 }
 193
 194 // merge tdigests with same distribution
 195 TEST(TDigestTest, MergeUniform) {
 196   const std::vector<size_t> sizes = {20000, 3000, 1500, 18000, 9999, 6666};
 197   std::vector<std::vector<double>> values_vector;
 198   for (auto size : sizes) {
 199     std::vector<double> values;
 200     random_real(size, 0x11223344, -123456789.0, 987654321.0, &values);
 201     values_vector.push_back(std::move(values));
 202   }
 203
 204 #ifdef _TDIGEST_STRICT_TEST
 205   TestMerge(values_vector, /*delta=*/100, /*error_ratio=*/0.01);
 206 #else
 207   TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.05);
 208 #endif
 209 }
 210
 211 // merge tdigests with different distributions
 212 TEST(TDigestTest, MergeNonUniform) {
 213   struct {
 214     size_t size;
 215     double min;
 216     double max;
 217   } configs[] = {
 218       {2000, 1e8, 1e9}, {0, 0, 0}, {3000, -1, 1}, {500, -1e6, -1e5}, {800, 100, 100},
 219   };
 220   std::vector<std::vector<double>> values_vector;
 221   for (const auto& cfg : configs) {
 222     std::vector<double> values;
 223     random_real(cfg.size, 0x11223344, cfg.min, cfg.max, &values);
 224     values_vector.push_back(std::move(values));
 225   }
 226
 227 #ifdef _TDIGEST_STRICT_TEST
 228   TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.01);
 229 #else
 230   TestMerge(values_vector, /*delta=*/200, /*error_ratio=*/0.05);
 231 #endif
 232 }
 233
 234 TEST(TDigestTest, Misc) {
 235   const size_t size = 100000;
 236   const double min = -1000, max = 1000;
 237   const std::vector<double> quantiles = {0, 0.01, 0.1, 0.4, 0.7, 0.9, 0.99, 1};
 238
 239   std::vector<double> values;
 240   random_real(size, 0x11223344, min, max, &values);
 241   const std::vector<double> expected = ExactQuantile(values, quantiles);
 242
 243   // test small delta and buffer
 244   {
 245 #ifdef _TDIGEST_STRICT_TEST
 246     const double error_ratio = 0.06;  // low accuracy for small delta
 247 #else
 248     const double error_ratio = 0.15;
 249 #endif
 250
 251     TDigest td(10, 50);
 252     for (double value : values) {
 253       td.Add(value);
 254     }
 255     ASSERT_OK(td.Validate());
 256
 257     for (size_t i = 0; i < quantiles.size(); ++i) {
 258       const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
 259       EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
 260     }
 261   }
 262
 263   // test many duplicated values
 264   {
 265 #ifdef _TDIGEST_STRICT_TEST
 266     const double error_ratio = 0.02;
 267 #else
 268     const double error_ratio = 0.05;
 269 #endif
 270
 271     auto values_integer = values;
 272     for (double& value : values_integer) {
 273       value = std::ceil(value);
 274     }
 275
 276     TDigest td(100);
 277     for (double value : values_integer) {
 278       td.Add(value);
 279     }
 280     ASSERT_OK(td.Validate());
 281
 282     for (size_t i = 0; i < quantiles.size(); ++i) {
 283       const double tolerance = std::max(std::fabs(expected[i]) * error_ratio, 0.1);
 284       EXPECT_NEAR(td.Quantile(quantiles[i]), expected[i], tolerance) << quantiles[i];
 285     }
 286   }
 287 }
 288
 289 }  // namespace internal
 290 }  // namespace arrow