]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/statistics.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / statistics.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <algorithm>
21#include <cstddef>
22#include <cstdint>
23#include <memory>
24#include <string>
25#include <utility>
26
27#include "parquet/platform.h"
28#include "parquet/types.h"
29
30namespace arrow {
31
32class Array;
33class BinaryArray;
34
35} // namespace arrow
36
37namespace parquet {
38
39class ColumnDescriptor;
40
41// ----------------------------------------------------------------------
42// Value comparator interfaces
43
44/// \brief Base class for value comparators. Generally used with
45/// TypedComparator<T>
46class PARQUET_EXPORT Comparator {
47 public:
48 virtual ~Comparator() {}
49
50 /// \brief Create a comparator explicitly from physical type and
51 /// sort order
52 /// \param[in] physical_type the physical type for the typed
53 /// comparator
54 /// \param[in] sort_order either SortOrder::SIGNED or
55 /// SortOrder::UNSIGNED
56 /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
57 static std::shared_ptr<Comparator> Make(Type::type physical_type,
58 SortOrder::type sort_order,
59 int type_length = -1);
60
61 /// \brief Create typed comparator inferring default sort order from
62 /// ColumnDescriptor
63 /// \param[in] descr the Parquet column schema
64 static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
65};
66
67/// \brief Interface for comparison of physical types according to the
68/// semantics of a particular logical type.
69template <typename DType>
70class TypedComparator : public Comparator {
71 public:
72 using T = typename DType::c_type;
73
74 /// \brief Scalar comparison of two elements, return true if first
75 /// is strictly less than the second
76 virtual bool Compare(const T& a, const T& b) = 0;
77
78 /// \brief Compute maximum and minimum elements in a batch of
79 /// elements without any nulls
80 virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
81
82 /// \brief Compute minimum and maximum elements from an Arrow array. Only
83 /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
84 /// / arrow::BinaryArray
85 virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
86
87 /// \brief Compute maximum and minimum elements in a batch of
88 /// elements with accompanying bitmap indicating which elements are
89 /// included (bit set) and excluded (bit not set)
90 ///
91 /// \param[in] values the sequence of values
92 /// \param[in] length the length of the sequence
93 /// \param[in] valid_bits a bitmap indicating which elements are
94 /// included (1) or excluded (0)
95 /// \param[in] valid_bits_offset the bit offset into the bitmap of
96 /// the first element in the sequence
97 virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
98 const uint8_t* valid_bits,
99 int64_t valid_bits_offset) = 0;
100};
101
102/// \brief Typed version of Comparator::Make
103template <typename DType>
104std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
105 SortOrder::type sort_order,
106 int type_length = -1) {
107 return std::static_pointer_cast<TypedComparator<DType>>(
108 Comparator::Make(physical_type, sort_order, type_length));
109}
110
111/// \brief Typed version of Comparator::Make
112template <typename DType>
113std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
114 return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
115}
116
117// ----------------------------------------------------------------------
118
/// \brief Encoded form of column statistics, as exchanged with Parquet
/// serialized metadata.
///
/// Each `has_*` flag tells whether the corresponding value has been set;
/// the setters store the value and raise the matching flag.
class PARQUET_EXPORT EncodedStatistics {
  // The encoded bounds are held by value so that every copy of an
  // EncodedStatistics owns its own strings. Holding them through
  // shared_ptr made copies alias the same storage: set_min()/set_max() on
  // one copy silently rewrote the bounds seen through every other copy,
  // while the has_min/has_max flags stayed per-object.
  std::string max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// \brief The encoded maximum value (empty until set_max is called)
  const std::string& max() const { return max_; }
  /// \brief The encoded minimum value (empty until set_min is called)
  const std::string& min() const { return min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // From parquet-mr
  // Don't write stats larger than the max size rather than truncating. The
  // rationale is that some engines may use the minimum value in the page as
  // the true minimum for aggregations and there is no way to mark that a
  // value has been truncated and is a lower bound and not in the page.
  /// \brief Mark min and/or max as unset when their encoded form is longer
  /// than `length` bytes. The stored strings themselves are left untouched.
  /// \param[in] length the largest encoded size that is still kept
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      has_max = false;
    }
    if (min_.length() > length) {
      has_min = false;
    }
  }

  /// \brief Return true if at least one statistic is flagged as set
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// \brief Store the encoded maximum and flag it as set
  /// \return *this, to allow chaining
  EncodedStatistics& set_max(const std::string& value) {
    max_ = value;
    has_max = true;
    return *this;
  }

  /// \brief Store the encoded minimum and flag it as set
  /// \return *this, to allow chaining
  EncodedStatistics& set_min(const std::string& value) {
    min_ = value;
    has_min = true;
    return *this;
  }

  /// \brief Store the null count and flag it as set
  /// \return *this, to allow chaining
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// \brief Store the distinct count and flag it as set
  /// \return *this, to allow chaining
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
186
187/// \brief Base type for computing column statistics while writing a file
188class PARQUET_EXPORT Statistics {
189 public:
190 virtual ~Statistics() {}
191
192 /// \brief Create a new statistics instance given a column schema
193 /// definition
194 /// \param[in] descr the column schema
195 /// \param[in] pool a memory pool to use for any memory allocations, optional
196 static std::shared_ptr<Statistics> Make(
197 const ColumnDescriptor* descr,
198 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
199
200 /// \brief Create a new statistics instance given a column schema
201 /// definition and pre-existing state
202 /// \param[in] descr the column schema
203 /// \param[in] encoded_min the encoded minimum value
204 /// \param[in] encoded_max the encoded maximum value
205 /// \param[in] num_values total number of values
206 /// \param[in] null_count number of null values
207 /// \param[in] distinct_count number of distinct values
208 /// \param[in] has_min_max whether the min/max statistics are set
209 /// \param[in] has_null_count whether the null_count statistics are set
210 /// \param[in] has_distinct_count whether the distinct_count statistics are set
211 /// \param[in] pool a memory pool to use for any memory allocations, optional
212 static std::shared_ptr<Statistics> Make(
213 const ColumnDescriptor* descr, const std::string& encoded_min,
214 const std::string& encoded_max, int64_t num_values, int64_t null_count,
215 int64_t distinct_count, bool has_min_max, bool has_null_count,
216 bool has_distinct_count,
217 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
218
219 /// \brief Return true if the count of null values is set
220 virtual bool HasNullCount() const = 0;
221
222 /// \brief The number of null values, may not be set
223 virtual int64_t null_count() const = 0;
224
225 /// \brief Return true if the count of distinct values is set
226 virtual bool HasDistinctCount() const = 0;
227
228 /// \brief The number of distinct values, may not be set
229 virtual int64_t distinct_count() const = 0;
230
231 /// \brief The total number of values in the column
232 virtual int64_t num_values() const = 0;
233
234 /// \brief Return true if the min and max statistics are set. Obtain
235 /// with TypedStatistics<T>::min and max
236 virtual bool HasMinMax() const = 0;
237
238 /// \brief Reset state of object to initial (no data observed) state
239 virtual void Reset() = 0;
240
241 /// \brief Plain-encoded minimum value
242 virtual std::string EncodeMin() const = 0;
243
244 /// \brief Plain-encoded maximum value
245 virtual std::string EncodeMax() const = 0;
246
247 /// \brief The finalized encoded form of the statistics for transport
248 virtual EncodedStatistics Encode() = 0;
249
250 /// \brief The physical type of the column schema
251 virtual Type::type physical_type() const = 0;
252
253 /// \brief The full type descriptor from the column schema
254 virtual const ColumnDescriptor* descr() const = 0;
255
256 /// \brief Check two Statistics for equality
257 virtual bool Equals(const Statistics& other) const = 0;
258
259 protected:
260 static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
261 const void* max, int64_t num_values,
262 int64_t null_count, int64_t distinct_count);
263};
264
265/// \brief A typed implementation of Statistics
266template <typename DType>
267class TypedStatistics : public Statistics {
268 public:
269 using T = typename DType::c_type;
270
271 /// \brief The current minimum value
272 virtual const T& min() const = 0;
273
274 /// \brief The current maximum value
275 virtual const T& max() const = 0;
276
277 /// \brief Update state with state of another Statistics object
278 virtual void Merge(const TypedStatistics<DType>& other) = 0;
279
280 /// \brief Batch statistics update
281 virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
282
283 /// \brief Batch statistics update with supplied validity bitmap
284 /// \param[in] values pointer to column values
285 /// \param[in] valid_bits Pointer to bitmap representing if values are non-null.
286 /// \param[in] valid_bits_offset Offset offset into valid_bits where the slice of
287 /// data begins.
288 /// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
289 /// when calculating statistics. This can be smaller than
290 /// num_not_null+num_null as num_null can include nulls
291 /// from parents while num_spaced_values does not.
292 /// \param[in] num_not_null Number of values that are not null.
293 /// \param[in] num_null Number of values that are null.
294 virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
295 int64_t valid_bits_offset, int64_t num_spaced_values,
296 int64_t num_not_null, int64_t num_null) = 0;
297
298 /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
299 /// conversion to a primitive Parquet C type. Only implemented for certain
300 /// Parquet type / Arrow type combinations like BYTE_ARRAY /
301 /// arrow::BinaryArray
302 ///
303 /// If update_counts is true then the null_count and num_values will be updated
304 /// based on the null_count of values. Set to false if these are updated
305 /// elsewhere (e.g. when updating a dictionary where the counts are taken from
306 /// the indices and not the values)
307 virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
308
309 /// \brief Set min and max values to particular values
310 virtual void SetMinMax(const T& min, const T& max) = 0;
311
312 /// \brief Increments the null count directly
313 /// Use Update to extract the null count from data. Use this if you determine
314 /// the null count through some other means (e.g. dictionary arrays where the
315 /// null count is determined from the indices)
316 virtual void IncrementNullCount(int64_t n) = 0;
317
318 /// \brief Increments the number ov values directly
319 /// The same note on IncrementNullCount applies here
320 virtual void IncrementNumValues(int64_t n) = 0;
321};
322
323using BoolStatistics = TypedStatistics<BooleanType>;
324using Int32Statistics = TypedStatistics<Int32Type>;
325using Int64Statistics = TypedStatistics<Int64Type>;
326using FloatStatistics = TypedStatistics<FloatType>;
327using DoubleStatistics = TypedStatistics<DoubleType>;
328using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
329using FLBAStatistics = TypedStatistics<FLBAType>;
330
331/// \brief Typed version of Statistics::Make
332template <typename DType>
333std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
334 const ColumnDescriptor* descr,
335 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
336 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
337}
338
339/// \brief Create Statistics initialized to a particular state
340/// \param[in] min the minimum value
341/// \param[in] max the minimum value
342/// \param[in] num_values number of values
343/// \param[in] null_count number of null values
344/// \param[in] distinct_count number of distinct values
345template <typename DType>
346std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
347 const typename DType::c_type& max,
348 int64_t num_values,
349 int64_t null_count,
350 int64_t distinct_count) {
351 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
352 DType::type_num, &min, &max, num_values, null_count, distinct_count));
353}
354
355/// \brief Typed version of Statistics::Make
356template <typename DType>
357std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
358 const ColumnDescriptor* descr, const std::string& encoded_min,
359 const std::string& encoded_max, int64_t num_values, int64_t null_count,
360 int64_t distinct_count, bool has_min_max, bool has_null_count,
361 bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
362 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
363 descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
364 has_min_max, has_null_count, has_distinct_count, pool));
365}
366
367} // namespace parquet