]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <algorithm> | |
21 | #include <cstddef> | |
22 | #include <cstdint> | |
23 | #include <memory> | |
24 | #include <string> | |
25 | #include <utility> | |
26 | ||
27 | #include "parquet/platform.h" | |
28 | #include "parquet/types.h" | |
29 | ||
30 | namespace arrow { | |
31 | ||
32 | class Array; | |
33 | class BinaryArray; | |
34 | ||
35 | } // namespace arrow | |
36 | ||
37 | namespace parquet { | |
38 | ||
39 | class ColumnDescriptor; | |
40 | ||
41 | // ---------------------------------------------------------------------- | |
42 | // Value comparator interfaces | |
43 | ||
44 | /// \brief Base class for value comparators. Generally used with | |
45 | /// TypedComparator<T> | |
46 | class PARQUET_EXPORT Comparator { | |
47 | public: | |
48 | virtual ~Comparator() {} | |
49 | ||
50 | /// \brief Create a comparator explicitly from physical type and | |
51 | /// sort order | |
52 | /// \param[in] physical_type the physical type for the typed | |
53 | /// comparator | |
54 | /// \param[in] sort_order either SortOrder::SIGNED or | |
55 | /// SortOrder::UNSIGNED | |
56 | /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only | |
57 | static std::shared_ptr<Comparator> Make(Type::type physical_type, | |
58 | SortOrder::type sort_order, | |
59 | int type_length = -1); | |
60 | ||
61 | /// \brief Create typed comparator inferring default sort order from | |
62 | /// ColumnDescriptor | |
63 | /// \param[in] descr the Parquet column schema | |
64 | static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr); | |
65 | }; | |
66 | ||
67 | /// \brief Interface for comparison of physical types according to the | |
68 | /// semantics of a particular logical type. | |
69 | template <typename DType> | |
70 | class TypedComparator : public Comparator { | |
71 | public: | |
72 | using T = typename DType::c_type; | |
73 | ||
74 | /// \brief Scalar comparison of two elements, return true if first | |
75 | /// is strictly less than the second | |
76 | virtual bool Compare(const T& a, const T& b) = 0; | |
77 | ||
78 | /// \brief Compute maximum and minimum elements in a batch of | |
79 | /// elements without any nulls | |
80 | virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0; | |
81 | ||
82 | /// \brief Compute minimum and maximum elements from an Arrow array. Only | |
83 | /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY | |
84 | /// / arrow::BinaryArray | |
85 | virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0; | |
86 | ||
87 | /// \brief Compute maximum and minimum elements in a batch of | |
88 | /// elements with accompanying bitmap indicating which elements are | |
89 | /// included (bit set) and excluded (bit not set) | |
90 | /// | |
91 | /// \param[in] values the sequence of values | |
92 | /// \param[in] length the length of the sequence | |
93 | /// \param[in] valid_bits a bitmap indicating which elements are | |
94 | /// included (1) or excluded (0) | |
95 | /// \param[in] valid_bits_offset the bit offset into the bitmap of | |
96 | /// the first element in the sequence | |
97 | virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length, | |
98 | const uint8_t* valid_bits, | |
99 | int64_t valid_bits_offset) = 0; | |
100 | }; | |
101 | ||
102 | /// \brief Typed version of Comparator::Make | |
103 | template <typename DType> | |
104 | std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type, | |
105 | SortOrder::type sort_order, | |
106 | int type_length = -1) { | |
107 | return std::static_pointer_cast<TypedComparator<DType>>( | |
108 | Comparator::Make(physical_type, sort_order, type_length)); | |
109 | } | |
110 | ||
111 | /// \brief Typed version of Comparator::Make | |
112 | template <typename DType> | |
113 | std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) { | |
114 | return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr)); | |
115 | } | |
116 | ||
117 | // ---------------------------------------------------------------------- | |
118 | ||
/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata
///
/// The encoded min/max are stored by value, so each copy of an
/// EncodedStatistics owns its own bounds: calling set_min()/set_max() on a
/// copy never alters the object it was copied from.
class PARQUET_EXPORT EncodedStatistics {
  std::string max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// \brief The encoded maximum; has_max tells whether it is set
  const std::string& max() const { return max_; }
  /// \brief The encoded minimum; has_min tells whether it is set
  const std::string& min() const { return min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // From parquet-mr
  // Don't write stats larger than the max size rather than truncating. The
  // rationale is that some engines may use the minimum value in the page as
  // the true minimum for aggregations and there is no way to mark that a
  // value has been truncated and is a lower bound and not in the page.
  //
  /// \brief Drop (mark as unset) any bound whose encoded form is longer than
  /// the given limit
  /// \param[in] length the largest encoded size, in bytes, that is kept
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      has_max = false;
    }
    if (min_.length() > length) {
      has_min = false;
    }
  }

  /// \brief Return true if at least one statistic is set
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// \brief Store the encoded maximum and mark it as set
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_max(const std::string& value) {
    max_ = value;
    has_max = true;
    return *this;
  }

  /// \brief Store the encoded minimum and mark it as set
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_min(const std::string& value) {
    min_ = value;
    has_min = true;
    return *this;
  }

  /// \brief Store the null count and mark it as set
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// \brief Store the distinct count and mark it as set
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
186 | ||
187 | /// \brief Base type for computing column statistics while writing a file | |
188 | class PARQUET_EXPORT Statistics { | |
189 | public: | |
190 | virtual ~Statistics() {} | |
191 | ||
192 | /// \brief Create a new statistics instance given a column schema | |
193 | /// definition | |
194 | /// \param[in] descr the column schema | |
195 | /// \param[in] pool a memory pool to use for any memory allocations, optional | |
196 | static std::shared_ptr<Statistics> Make( | |
197 | const ColumnDescriptor* descr, | |
198 | ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); | |
199 | ||
200 | /// \brief Create a new statistics instance given a column schema | |
201 | /// definition and pre-existing state | |
202 | /// \param[in] descr the column schema | |
203 | /// \param[in] encoded_min the encoded minimum value | |
204 | /// \param[in] encoded_max the encoded maximum value | |
205 | /// \param[in] num_values total number of values | |
206 | /// \param[in] null_count number of null values | |
207 | /// \param[in] distinct_count number of distinct values | |
208 | /// \param[in] has_min_max whether the min/max statistics are set | |
209 | /// \param[in] has_null_count whether the null_count statistics are set | |
210 | /// \param[in] has_distinct_count whether the distinct_count statistics are set | |
211 | /// \param[in] pool a memory pool to use for any memory allocations, optional | |
212 | static std::shared_ptr<Statistics> Make( | |
213 | const ColumnDescriptor* descr, const std::string& encoded_min, | |
214 | const std::string& encoded_max, int64_t num_values, int64_t null_count, | |
215 | int64_t distinct_count, bool has_min_max, bool has_null_count, | |
216 | bool has_distinct_count, | |
217 | ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); | |
218 | ||
219 | /// \brief Return true if the count of null values is set | |
220 | virtual bool HasNullCount() const = 0; | |
221 | ||
222 | /// \brief The number of null values, may not be set | |
223 | virtual int64_t null_count() const = 0; | |
224 | ||
225 | /// \brief Return true if the count of distinct values is set | |
226 | virtual bool HasDistinctCount() const = 0; | |
227 | ||
228 | /// \brief The number of distinct values, may not be set | |
229 | virtual int64_t distinct_count() const = 0; | |
230 | ||
231 | /// \brief The total number of values in the column | |
232 | virtual int64_t num_values() const = 0; | |
233 | ||
234 | /// \brief Return true if the min and max statistics are set. Obtain | |
235 | /// with TypedStatistics<T>::min and max | |
236 | virtual bool HasMinMax() const = 0; | |
237 | ||
238 | /// \brief Reset state of object to initial (no data observed) state | |
239 | virtual void Reset() = 0; | |
240 | ||
241 | /// \brief Plain-encoded minimum value | |
242 | virtual std::string EncodeMin() const = 0; | |
243 | ||
244 | /// \brief Plain-encoded maximum value | |
245 | virtual std::string EncodeMax() const = 0; | |
246 | ||
247 | /// \brief The finalized encoded form of the statistics for transport | |
248 | virtual EncodedStatistics Encode() = 0; | |
249 | ||
250 | /// \brief The physical type of the column schema | |
251 | virtual Type::type physical_type() const = 0; | |
252 | ||
253 | /// \brief The full type descriptor from the column schema | |
254 | virtual const ColumnDescriptor* descr() const = 0; | |
255 | ||
256 | /// \brief Check two Statistics for equality | |
257 | virtual bool Equals(const Statistics& other) const = 0; | |
258 | ||
259 | protected: | |
260 | static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min, | |
261 | const void* max, int64_t num_values, | |
262 | int64_t null_count, int64_t distinct_count); | |
263 | }; | |
264 | ||
265 | /// \brief A typed implementation of Statistics | |
266 | template <typename DType> | |
267 | class TypedStatistics : public Statistics { | |
268 | public: | |
269 | using T = typename DType::c_type; | |
270 | ||
271 | /// \brief The current minimum value | |
272 | virtual const T& min() const = 0; | |
273 | ||
274 | /// \brief The current maximum value | |
275 | virtual const T& max() const = 0; | |
276 | ||
277 | /// \brief Update state with state of another Statistics object | |
278 | virtual void Merge(const TypedStatistics<DType>& other) = 0; | |
279 | ||
280 | /// \brief Batch statistics update | |
281 | virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0; | |
282 | ||
283 | /// \brief Batch statistics update with supplied validity bitmap | |
284 | /// \param[in] values pointer to column values | |
285 | /// \param[in] valid_bits Pointer to bitmap representing if values are non-null. | |
286 | /// \param[in] valid_bits_offset Offset offset into valid_bits where the slice of | |
287 | /// data begins. | |
288 | /// \param[in] num_spaced_values The length of values in values/valid_bits to inspect | |
289 | /// when calculating statistics. This can be smaller than | |
290 | /// num_not_null+num_null as num_null can include nulls | |
291 | /// from parents while num_spaced_values does not. | |
292 | /// \param[in] num_not_null Number of values that are not null. | |
293 | /// \param[in] num_null Number of values that are null. | |
294 | virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, | |
295 | int64_t valid_bits_offset, int64_t num_spaced_values, | |
296 | int64_t num_not_null, int64_t num_null) = 0; | |
297 | ||
298 | /// \brief EXPERIMENTAL: Update statistics with an Arrow array without | |
299 | /// conversion to a primitive Parquet C type. Only implemented for certain | |
300 | /// Parquet type / Arrow type combinations like BYTE_ARRAY / | |
301 | /// arrow::BinaryArray | |
302 | /// | |
303 | /// If update_counts is true then the null_count and num_values will be updated | |
304 | /// based on the null_count of values. Set to false if these are updated | |
305 | /// elsewhere (e.g. when updating a dictionary where the counts are taken from | |
306 | /// the indices and not the values) | |
307 | virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0; | |
308 | ||
309 | /// \brief Set min and max values to particular values | |
310 | virtual void SetMinMax(const T& min, const T& max) = 0; | |
311 | ||
312 | /// \brief Increments the null count directly | |
313 | /// Use Update to extract the null count from data. Use this if you determine | |
314 | /// the null count through some other means (e.g. dictionary arrays where the | |
315 | /// null count is determined from the indices) | |
316 | virtual void IncrementNullCount(int64_t n) = 0; | |
317 | ||
318 | /// \brief Increments the number ov values directly | |
319 | /// The same note on IncrementNullCount applies here | |
320 | virtual void IncrementNumValues(int64_t n) = 0; | |
321 | }; | |
322 | ||
323 | using BoolStatistics = TypedStatistics<BooleanType>; | |
324 | using Int32Statistics = TypedStatistics<Int32Type>; | |
325 | using Int64Statistics = TypedStatistics<Int64Type>; | |
326 | using FloatStatistics = TypedStatistics<FloatType>; | |
327 | using DoubleStatistics = TypedStatistics<DoubleType>; | |
328 | using ByteArrayStatistics = TypedStatistics<ByteArrayType>; | |
329 | using FLBAStatistics = TypedStatistics<FLBAType>; | |
330 | ||
331 | /// \brief Typed version of Statistics::Make | |
332 | template <typename DType> | |
333 | std::shared_ptr<TypedStatistics<DType>> MakeStatistics( | |
334 | const ColumnDescriptor* descr, | |
335 | ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { | |
336 | return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool)); | |
337 | } | |
338 | ||
339 | /// \brief Create Statistics initialized to a particular state | |
340 | /// \param[in] min the minimum value | |
341 | /// \param[in] max the minimum value | |
342 | /// \param[in] num_values number of values | |
343 | /// \param[in] null_count number of null values | |
344 | /// \param[in] distinct_count number of distinct values | |
345 | template <typename DType> | |
346 | std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min, | |
347 | const typename DType::c_type& max, | |
348 | int64_t num_values, | |
349 | int64_t null_count, | |
350 | int64_t distinct_count) { | |
351 | return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make( | |
352 | DType::type_num, &min, &max, num_values, null_count, distinct_count)); | |
353 | } | |
354 | ||
355 | /// \brief Typed version of Statistics::Make | |
356 | template <typename DType> | |
357 | std::shared_ptr<TypedStatistics<DType>> MakeStatistics( | |
358 | const ColumnDescriptor* descr, const std::string& encoded_min, | |
359 | const std::string& encoded_max, int64_t num_values, int64_t null_count, | |
360 | int64_t distinct_count, bool has_min_max, bool has_null_count, | |
361 | bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { | |
362 | return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make( | |
363 | descr, encoded_min, encoded_max, num_values, null_count, distinct_count, | |
364 | has_min_max, has_null_count, has_distinct_count, pool)); | |
365 | } | |
366 | ||
367 | } // namespace parquet |