]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/level_conversion.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / level_conversion.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <cstdint>
21
22#include "arrow/util/endian.h"
23#include "parquet/platform.h"
24#include "parquet/schema.h"
25
26namespace parquet {
27namespace internal {
28
29struct PARQUET_EXPORT LevelInfo {
30 LevelInfo()
31 : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
32 LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
33 int32_t repeated_ancestor_definition_level)
34 : null_slot_usage(null_slots),
35 def_level(definition_level),
36 rep_level(repetition_level),
37 repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
38
39 bool operator==(const LevelInfo& b) const {
40 return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
41 rep_level == b.rep_level &&
42 repeated_ancestor_def_level == b.repeated_ancestor_def_level;
43 }
44
45 bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
46
47 // How many slots an undefined but present (i.e. null) element in
48 // parquet consumes when decoding to Arrow.
49 // "Slot" is used in the same context as the Arrow specification
50 // (i.e. a value holder).
51 // This is only ever >1 for descendents of FixedSizeList.
52 int32_t null_slot_usage = 1;
53
54 // The definition level at which the value for the field
55 // is considered not null (definition levels greater than
56 // or equal to this value indicate a not-null
57 // value for the field). For list fields definition levels
58 // greater than or equal to this field indicate a present,
59 // possibly null, child value.
60 int16_t def_level = 0;
61
62 // The repetition level corresponding to this element
63 // or the closest repeated ancestor. Any repetition
64 // level less than this indicates either a new list OR
65 // an empty list (which is determined in conjunction
66 // with definition levels).
67 int16_t rep_level = 0;
68
69 // The definition level indicating the level at which the closest
70 // repeated ancestor is not empty. This is used to discriminate
71 // between a value less than |def_level| being null or excluded entirely.
72 // For instance if we have an arrow schema like:
73 // list(struct(f0: int)). Then then there are the following
74 // definition levels:
75 // 0 = null list
76 // 1 = present but empty list.
77 // 2 = a null value in the list
78 // 3 = a non null struct but null integer.
79 // 4 = a present integer.
80 // When reconstructing, the struct and integer arrays'
81 // repeated_ancestor_def_level would be 2. Any
82 // def_level < 2 indicates that there isn't a corresponding
83 // child value in the list.
84 // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
85 // has the def levels [0, 1, 2, 3, 4]. The actual
86 // struct array is only of length 3: [not-set, set, set] and
87 // the int array is also of length 3: [N/A, null, 1].
88 //
89 int16_t repeated_ancestor_def_level = 0;
90
91 /// Increments levels according to the cardinality of node.
92 void Increment(const schema::Node& node) {
93 if (node.is_repeated()) {
94 IncrementRepeated();
95 return;
96 }
97 if (node.is_optional()) {
98 IncrementOptional();
99 return;
100 }
101 }
102
103 /// Incremetns level for a optional node.
104 void IncrementOptional() { def_level++; }
105
106 /// Increments levels for the repeated node. Returns
107 /// the previous ancestor_list_def_level.
108 int16_t IncrementRepeated() {
109 int16_t last_repeated_ancestor = repeated_ancestor_def_level;
110
111 // Repeated fields add both a repetition and definition level. This is used
112 // to distinguish between an empty list and a list with an item in it.
113 ++rep_level;
114 ++def_level;
115 // For levels >= repeated_ancenstor_def_level it indicates the list was
116 // non-null and had at least one element. This is important
117 // for later decoding because we need to add a slot for these
118 // values. for levels < current_def_level no slots are added
119 // to arrays.
120 repeated_ancestor_def_level = def_level;
121 return last_repeated_ancestor;
122 }
123
124 friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
125 // This print method is to silence valgrind issues. What's printed
126 // is not important because all asserts happen directly on
127 // members.
128 os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
129 << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
130 if (levels.null_slot_usage > 1) {
131 os << ", null_slot_usage=" << levels.null_slot_usage;
132 }
133 os << "}";
134 return os;
135 }
136};
137
138// Input/Output structure for reconstructed validity bitmaps.
139struct PARQUET_EXPORT ValidityBitmapInputOutput {
140 // Input only.
141 // The maximum number of values_read expected (actual
142 // values read must be less than or equal to this value).
143 // If this number is exceeded methods will throw a
144 // ParquetException. Exceeding this limit indicates
145 // either a corrupt or incorrectly written file.
146 int64_t values_read_upper_bound = 0;
147 // Output only. The number of values added to the encountered
148 // (this is logically the count of the number of elements
149 // for an Arrow array).
150 int64_t values_read = 0;
151 // Input/Output. The number of nulls encountered.
152 int64_t null_count = 0;
153 // Output only. The validity bitmap to populate. May be be null only
154 // for DefRepLevelsToListInfo (if all that is needed is list offsets).
155 uint8_t* valid_bits = NULLPTR;
156 // Input only, offset into valid_bits to start at.
157 int64_t valid_bits_offset = 0;
158};
159
160// Converts def_levels to validity bitmaps for non-list arrays and structs that have
161// at least one member that is not a list and has no list descendents.
162// For lists use DefRepLevelsToList and structs where all descendants contain
163// a list use DefRepLevelsToBitmap.
164void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
165 LevelInfo level_info,
166 ValidityBitmapInputOutput* output);
167
168// Reconstructs a validity bitmap and list offsets for a list arrays based on
169// def/rep levels. The first element of offsets will not be modified if rep_levels
170// starts with a new list. The first element of offsets will be used when calculating
171// the next offset. See documentation onf DefLevelsToBitmap for when to use this
172// method vs the other ones in this file for reconstruction.
173//
174// Offsets must be sized to 1 + values_read_upper_bound.
175void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
176 const int16_t* rep_levels, int64_t num_def_levels,
177 LevelInfo level_info,
178 ValidityBitmapInputOutput* output,
179 int32_t* offsets);
180void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
181 const int16_t* rep_levels, int64_t num_def_levels,
182 LevelInfo level_info,
183 ValidityBitmapInputOutput* output,
184 int64_t* offsets);
185
186// Reconstructs a validity bitmap for a struct every member is a list or has
187// a list descendant. See documentation on DefLevelsToBitmap for when more
188// details on this method compared to the other ones defined above.
189void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
190 const int16_t* rep_levels,
191 int64_t num_def_levels, LevelInfo level_info,
192 ValidityBitmapInputOutput* output);
193
194// This is exposed to ensure we can properly test a software simulated pext function
195// (i.e. it isn't hidden by runtime dispatch).
196uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
197
198} // namespace internal
199} // namespace parquet