]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <cstdint> | |
21 | ||
22 | #include "arrow/util/endian.h" | |
23 | #include "parquet/platform.h" | |
24 | #include "parquet/schema.h" | |
25 | ||
26 | namespace parquet { | |
27 | namespace internal { | |
28 | ||
29 | struct PARQUET_EXPORT LevelInfo { | |
30 | LevelInfo() | |
31 | : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} | |
32 | LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level, | |
33 | int32_t repeated_ancestor_definition_level) | |
34 | : null_slot_usage(null_slots), | |
35 | def_level(definition_level), | |
36 | rep_level(repetition_level), | |
37 | repeated_ancestor_def_level(repeated_ancestor_definition_level) {} | |
38 | ||
39 | bool operator==(const LevelInfo& b) const { | |
40 | return null_slot_usage == b.null_slot_usage && def_level == b.def_level && | |
41 | rep_level == b.rep_level && | |
42 | repeated_ancestor_def_level == b.repeated_ancestor_def_level; | |
43 | } | |
44 | ||
45 | bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; } | |
46 | ||
47 | // How many slots an undefined but present (i.e. null) element in | |
48 | // parquet consumes when decoding to Arrow. | |
49 | // "Slot" is used in the same context as the Arrow specification | |
50 | // (i.e. a value holder). | |
51 | // This is only ever >1 for descendents of FixedSizeList. | |
52 | int32_t null_slot_usage = 1; | |
53 | ||
54 | // The definition level at which the value for the field | |
55 | // is considered not null (definition levels greater than | |
56 | // or equal to this value indicate a not-null | |
57 | // value for the field). For list fields definition levels | |
58 | // greater than or equal to this field indicate a present, | |
59 | // possibly null, child value. | |
60 | int16_t def_level = 0; | |
61 | ||
62 | // The repetition level corresponding to this element | |
63 | // or the closest repeated ancestor. Any repetition | |
64 | // level less than this indicates either a new list OR | |
65 | // an empty list (which is determined in conjunction | |
66 | // with definition levels). | |
67 | int16_t rep_level = 0; | |
68 | ||
69 | // The definition level indicating the level at which the closest | |
70 | // repeated ancestor is not empty. This is used to discriminate | |
71 | // between a value less than |def_level| being null or excluded entirely. | |
72 | // For instance if we have an arrow schema like: | |
73 | // list(struct(f0: int)). Then then there are the following | |
74 | // definition levels: | |
75 | // 0 = null list | |
76 | // 1 = present but empty list. | |
77 | // 2 = a null value in the list | |
78 | // 3 = a non null struct but null integer. | |
79 | // 4 = a present integer. | |
80 | // When reconstructing, the struct and integer arrays' | |
81 | // repeated_ancestor_def_level would be 2. Any | |
82 | // def_level < 2 indicates that there isn't a corresponding | |
83 | // child value in the list. | |
84 | // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]] | |
85 | // has the def levels [0, 1, 2, 3, 4]. The actual | |
86 | // struct array is only of length 3: [not-set, set, set] and | |
87 | // the int array is also of length 3: [N/A, null, 1]. | |
88 | // | |
89 | int16_t repeated_ancestor_def_level = 0; | |
90 | ||
91 | /// Increments levels according to the cardinality of node. | |
92 | void Increment(const schema::Node& node) { | |
93 | if (node.is_repeated()) { | |
94 | IncrementRepeated(); | |
95 | return; | |
96 | } | |
97 | if (node.is_optional()) { | |
98 | IncrementOptional(); | |
99 | return; | |
100 | } | |
101 | } | |
102 | ||
103 | /// Incremetns level for a optional node. | |
104 | void IncrementOptional() { def_level++; } | |
105 | ||
106 | /// Increments levels for the repeated node. Returns | |
107 | /// the previous ancestor_list_def_level. | |
108 | int16_t IncrementRepeated() { | |
109 | int16_t last_repeated_ancestor = repeated_ancestor_def_level; | |
110 | ||
111 | // Repeated fields add both a repetition and definition level. This is used | |
112 | // to distinguish between an empty list and a list with an item in it. | |
113 | ++rep_level; | |
114 | ++def_level; | |
115 | // For levels >= repeated_ancenstor_def_level it indicates the list was | |
116 | // non-null and had at least one element. This is important | |
117 | // for later decoding because we need to add a slot for these | |
118 | // values. for levels < current_def_level no slots are added | |
119 | // to arrays. | |
120 | repeated_ancestor_def_level = def_level; | |
121 | return last_repeated_ancestor; | |
122 | } | |
123 | ||
124 | friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) { | |
125 | // This print method is to silence valgrind issues. What's printed | |
126 | // is not important because all asserts happen directly on | |
127 | // members. | |
128 | os << "{def=" << levels.def_level << ", rep=" << levels.rep_level | |
129 | << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level; | |
130 | if (levels.null_slot_usage > 1) { | |
131 | os << ", null_slot_usage=" << levels.null_slot_usage; | |
132 | } | |
133 | os << "}"; | |
134 | return os; | |
135 | } | |
136 | }; | |
137 | ||
138 | // Input/Output structure for reconstructed validity bitmaps. | |
139 | struct PARQUET_EXPORT ValidityBitmapInputOutput { | |
140 | // Input only. | |
141 | // The maximum number of values_read expected (actual | |
142 | // values read must be less than or equal to this value). | |
143 | // If this number is exceeded methods will throw a | |
144 | // ParquetException. Exceeding this limit indicates | |
145 | // either a corrupt or incorrectly written file. | |
146 | int64_t values_read_upper_bound = 0; | |
147 | // Output only. The number of values added to the encountered | |
148 | // (this is logically the count of the number of elements | |
149 | // for an Arrow array). | |
150 | int64_t values_read = 0; | |
151 | // Input/Output. The number of nulls encountered. | |
152 | int64_t null_count = 0; | |
153 | // Output only. The validity bitmap to populate. May be be null only | |
154 | // for DefRepLevelsToListInfo (if all that is needed is list offsets). | |
155 | uint8_t* valid_bits = NULLPTR; | |
156 | // Input only, offset into valid_bits to start at. | |
157 | int64_t valid_bits_offset = 0; | |
158 | }; | |
159 | ||
160 | // Converts def_levels to validity bitmaps for non-list arrays and structs that have | |
161 | // at least one member that is not a list and has no list descendents. | |
162 | // For lists use DefRepLevelsToList and structs where all descendants contain | |
163 | // a list use DefRepLevelsToBitmap. | |
164 | void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, | |
165 | LevelInfo level_info, | |
166 | ValidityBitmapInputOutput* output); | |
167 | ||
168 | // Reconstructs a validity bitmap and list offsets for a list arrays based on | |
169 | // def/rep levels. The first element of offsets will not be modified if rep_levels | |
170 | // starts with a new list. The first element of offsets will be used when calculating | |
171 | // the next offset. See documentation onf DefLevelsToBitmap for when to use this | |
172 | // method vs the other ones in this file for reconstruction. | |
173 | // | |
174 | // Offsets must be sized to 1 + values_read_upper_bound. | |
175 | void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, | |
176 | const int16_t* rep_levels, int64_t num_def_levels, | |
177 | LevelInfo level_info, | |
178 | ValidityBitmapInputOutput* output, | |
179 | int32_t* offsets); | |
180 | void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, | |
181 | const int16_t* rep_levels, int64_t num_def_levels, | |
182 | LevelInfo level_info, | |
183 | ValidityBitmapInputOutput* output, | |
184 | int64_t* offsets); | |
185 | ||
186 | // Reconstructs a validity bitmap for a struct every member is a list or has | |
187 | // a list descendant. See documentation on DefLevelsToBitmap for when more | |
188 | // details on this method compared to the other ones defined above. | |
189 | void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels, | |
190 | const int16_t* rep_levels, | |
191 | int64_t num_def_levels, LevelInfo level_info, | |
192 | ValidityBitmapInputOutput* output); | |
193 | ||
194 | // This is exposed to ensure we can properly test a software simulated pext function | |
195 | // (i.e. it isn't hidden by runtime dispatch). | |
196 | uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection); | |
197 | ||
198 | } // namespace internal | |
199 | } // namespace parquet |