]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "parquet/arrow/path_internal.h" | |
19 | ||
20 | #include <algorithm> | |
21 | #include <memory> | |
22 | #include <utility> | |
23 | #include <vector> | |
24 | ||
25 | #include <gmock/gmock.h> | |
26 | #include <gtest/gtest.h> | |
27 | ||
28 | #include "arrow/array/concatenate.h" | |
29 | #include "arrow/chunked_array.h" | |
30 | #include "arrow/io/memory.h" | |
31 | #include "arrow/result.h" | |
32 | #include "arrow/testing/gtest_util.h" | |
33 | #include "arrow/type_fwd.h" | |
34 | #include "arrow/util/checked_cast.h" | |
35 | #include "arrow/util/logging.h" | |
36 | ||
37 | #include "parquet/arrow/reader.h" | |
38 | #include "parquet/arrow/schema.h" | |
39 | #include "parquet/column_writer.h" | |
40 | #include "parquet/file_writer.h" | |
41 | #include "parquet/properties.h" | |
42 | ||
43 | using arrow::Array; | |
44 | using arrow::ArrayFromJSON; | |
45 | using arrow::AssertArraysEqual; | |
46 | using arrow::ChunkedArray; | |
47 | using arrow::DataType; | |
48 | using arrow::field; | |
49 | using arrow::int32; | |
50 | using arrow::int64; | |
51 | using arrow::list; | |
52 | using arrow::MemoryPool; | |
53 | using arrow::Result; | |
54 | using arrow::Status; | |
55 | using arrow::struct_; | |
56 | using arrow::internal::checked_cast; | |
57 | using arrow::internal::checked_pointer_cast; | |
58 | using arrow::io::BufferOutputStream; | |
59 | using arrow::io::BufferReader; | |
60 | ||
61 | using testing::ElementsAre; | |
62 | using testing::ElementsAreArray; | |
63 | using testing::Eq; | |
64 | using testing::NotNull; | |
65 | using testing::SizeIs; | |
66 | ||
67 | namespace parquet { | |
68 | namespace arrow { | |
69 | ||
70 | using parquet::schema::GroupNode; | |
71 | using parquet::schema::NodePtr; | |
72 | using parquet::schema::PrimitiveNode; | |
73 | ||
74 | using ParquetType = parquet::Type::type; | |
75 | template <ParquetType T> | |
76 | using ParquetTraits = parquet::type_traits<T>; | |
77 | ||
78 | using LevelVector = std::vector<int16_t>; | |
79 | // For readability | |
80 | using DefLevels = LevelVector; | |
81 | using RepLevels = LevelVector; | |
82 | using Int32Vector = std::vector<int32_t>; | |
83 | using Int64Vector = std::vector<int64_t>; | |
84 | ||
85 | // A Parquet file builder that allows writing values one leaf column at a time | |
86 | class FileBuilder { | |
87 | public: | |
88 | static Result<std::shared_ptr<FileBuilder>> Make(const NodePtr& group_node, | |
89 | int num_columns) { | |
90 | auto self = std::make_shared<FileBuilder>(); | |
91 | RETURN_NOT_OK(self->Open(group_node, num_columns)); | |
92 | return self; | |
93 | } | |
94 | ||
95 | Result<std::shared_ptr<Buffer>> Finish() { | |
96 | DCHECK_EQ(column_index_, num_columns_); | |
97 | row_group_writer_->Close(); | |
98 | file_writer_->Close(); | |
99 | return stream_->Finish(); | |
100 | } | |
101 | ||
102 | // Write a leaf (primitive) column | |
103 | template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type> | |
104 | Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels, | |
105 | const std::vector<C_TYPE>& values) { | |
106 | auto column_writer = row_group_writer_->NextColumn(); | |
107 | auto column_descr = column_writer->descr(); | |
108 | const int16_t max_def_level = column_descr->max_definition_level(); | |
109 | const int16_t max_rep_level = column_descr->max_repetition_level(); | |
110 | CheckTestedLevels(def_levels, max_def_level); | |
111 | CheckTestedLevels(rep_levels, max_rep_level); | |
112 | ||
113 | auto typed_writer = | |
114 | checked_cast<TypedColumnWriter<PhysicalType<TYPE>>*>(column_writer); | |
115 | ||
116 | const int64_t num_values = static_cast<int64_t>( | |
117 | (max_def_level > 0) ? def_levels.size() | |
118 | : (max_rep_level > 0) ? rep_levels.size() : values.size()); | |
119 | const int64_t values_written = typed_writer->WriteBatch( | |
120 | num_values, LevelPointerOrNull(def_levels, max_def_level), | |
121 | LevelPointerOrNull(rep_levels, max_rep_level), values.data()); | |
122 | DCHECK_EQ(values_written, static_cast<int64_t>(values.size())); // Sanity check | |
123 | ||
124 | column_writer->Close(); | |
125 | ++column_index_; | |
126 | return Status::OK(); | |
127 | } | |
128 | ||
129 | protected: | |
130 | Status Open(const NodePtr& group_node, int num_columns) { | |
131 | ARROW_ASSIGN_OR_RAISE(stream_, BufferOutputStream::Create()); | |
132 | file_writer_ = | |
133 | ParquetFileWriter::Open(stream_, checked_pointer_cast<GroupNode>(group_node)); | |
134 | row_group_writer_ = file_writer_->AppendRowGroup(); | |
135 | num_columns_ = num_columns; | |
136 | column_index_ = 0; | |
137 | return Status::OK(); | |
138 | } | |
139 | ||
140 | void CheckTestedLevels(const LevelVector& levels, int16_t max_level) { | |
141 | // Tests are expected to exercise all possible levels in [0, max_level] | |
142 | if (!levels.empty()) { | |
143 | const int16_t max_seen_level = *std::max_element(levels.begin(), levels.end()); | |
144 | DCHECK_EQ(max_seen_level, max_level); | |
145 | } | |
146 | } | |
147 | ||
148 | const int16_t* LevelPointerOrNull(const LevelVector& levels, int16_t max_level) { | |
149 | if (max_level > 0) { | |
150 | DCHECK_GT(levels.size(), 0); | |
151 | return levels.data(); | |
152 | } else { | |
153 | DCHECK_EQ(levels.size(), 0); | |
154 | return nullptr; | |
155 | } | |
156 | } | |
157 | ||
158 | std::shared_ptr<BufferOutputStream> stream_; | |
159 | std::unique_ptr<ParquetFileWriter> file_writer_; | |
160 | RowGroupWriter* row_group_writer_; | |
161 | int num_columns_; | |
162 | int column_index_; | |
163 | }; | |
164 | ||
165 | // A Parquet file tester that allows reading Arrow columns, corresponding to | |
166 | // children of the top-level group node. | |
167 | class FileTester { | |
168 | public: | |
169 | static Result<std::shared_ptr<FileTester>> Make(std::shared_ptr<Buffer> buffer, | |
170 | MemoryPool* pool) { | |
171 | auto self = std::make_shared<FileTester>(); | |
172 | RETURN_NOT_OK(self->Open(buffer, pool)); | |
173 | return self; | |
174 | } | |
175 | ||
176 | Result<std::shared_ptr<Array>> ReadColumn(int column_index) { | |
177 | std::shared_ptr<ChunkedArray> column; | |
178 | RETURN_NOT_OK(file_reader_->ReadColumn(column_index, &column)); | |
179 | return ::arrow::Concatenate(column->chunks(), pool_); | |
180 | } | |
181 | ||
182 | void CheckColumn(int column_index, const Array& expected) { | |
183 | ASSERT_OK_AND_ASSIGN(const auto actual, ReadColumn(column_index)); | |
184 | ASSERT_OK(actual->ValidateFull()); | |
185 | AssertArraysEqual(expected, *actual, /*verbose=*/true); | |
186 | } | |
187 | ||
188 | protected: | |
189 | Status Open(std::shared_ptr<Buffer> buffer, MemoryPool* pool) { | |
190 | pool_ = pool; | |
191 | return OpenFile(std::make_shared<BufferReader>(buffer), pool_, &file_reader_); | |
192 | } | |
193 | ||
194 | MemoryPool* pool_; | |
195 | std::unique_ptr<FileReader> file_reader_; | |
196 | }; | |
197 | ||
198 | class TestReconstructColumn : public testing::Test { | |
199 | public: | |
200 | void SetUp() override { pool_ = ::arrow::default_memory_pool(); } | |
201 | ||
202 | // Write the next leaf (primitive) column | |
203 | template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type> | |
204 | Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels, | |
205 | const std::vector<C_TYPE>& values) { | |
206 | if (!builder_) { | |
207 | ARROW_ASSIGN_OR_RAISE(builder_, | |
208 | FileBuilder::Make(group_node_, descriptor_->num_columns())); | |
209 | } | |
210 | return builder_->WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values); | |
211 | } | |
212 | ||
213 | template <typename C_TYPE> | |
214 | Status WriteInt32Column(const LevelVector& def_levels, const LevelVector& rep_levels, | |
215 | const std::vector<C_TYPE>& values) { | |
216 | return WriteColumn<ParquetType::INT32>(def_levels, rep_levels, values); | |
217 | } | |
218 | ||
219 | template <typename C_TYPE> | |
220 | Status WriteInt64Column(const LevelVector& def_levels, const LevelVector& rep_levels, | |
221 | const std::vector<C_TYPE>& values) { | |
222 | return WriteColumn<ParquetType::INT64>(def_levels, rep_levels, values); | |
223 | } | |
224 | ||
225 | // Read a Arrow column and check its values | |
226 | void CheckColumn(int column_index, const Array& expected) { | |
227 | if (!tester_) { | |
228 | ASSERT_OK_AND_ASSIGN(auto buffer, builder_->Finish()); | |
229 | ASSERT_OK_AND_ASSIGN(tester_, FileTester::Make(buffer, pool_)); | |
230 | } | |
231 | tester_->CheckColumn(column_index, expected); | |
232 | } | |
233 | ||
234 | void CheckColumn(const Array& expected) { CheckColumn(/*column_index=*/0, expected); } | |
235 | ||
236 | // One-column shortcut | |
237 | template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type> | |
238 | void AssertReconstruct(const Array& expected, const LevelVector& def_levels, | |
239 | const LevelVector& rep_levels, | |
240 | const std::vector<C_TYPE>& values) { | |
241 | ASSERT_OK((WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values))); | |
242 | CheckColumn(/*column_index=*/0, expected); | |
243 | } | |
244 | ||
245 | ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { | |
246 | descriptor_.reset(new SchemaDescriptor()); | |
247 | manifest_.reset(new SchemaManifest()); | |
248 | group_node_ = GroupNode::Make("root", Repetition::REQUIRED, {column}); | |
249 | descriptor_->Init(group_node_); | |
250 | return SchemaManifest::Make(descriptor_.get(), | |
251 | std::shared_ptr<const ::arrow::KeyValueMetadata>(), | |
252 | ArrowReaderProperties(), manifest_.get()); | |
253 | } | |
254 | ||
255 | void SetParquetSchema(const NodePtr& column) { | |
256 | ASSERT_OK(MaybeSetParquetSchema(column)); | |
257 | } | |
258 | ||
259 | protected: | |
260 | MemoryPool* pool_; | |
261 | NodePtr group_node_; | |
262 | std::unique_ptr<SchemaDescriptor> descriptor_; | |
263 | std::unique_ptr<SchemaManifest> manifest_; | |
264 | ||
265 | std::shared_ptr<FileBuilder> builder_; | |
266 | std::shared_ptr<FileTester> tester_; | |
267 | }; | |
268 | ||
269 | static std::shared_ptr<DataType> OneFieldStruct(const std::string& name, | |
270 | std::shared_ptr<DataType> type, | |
271 | bool nullable = true) { | |
272 | return struct_({field(name, type, nullable)}); | |
273 | } | |
274 | ||
275 | static std::shared_ptr<DataType> List(std::shared_ptr<DataType> type, | |
276 | bool nullable = true) { | |
277 | // TODO should field name "element" (Parquet convention for List nodes) | |
278 | // be changed to "item" (Arrow convention for List types)? | |
279 | return list(field("element", type, nullable)); | |
280 | } | |
281 | ||
282 | // | |
283 | // Primitive columns with no intermediate group node | |
284 | // | |
285 | ||
286 | TEST_F(TestReconstructColumn, PrimitiveOptional) { | |
287 | SetParquetSchema( | |
288 | PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::INT32)); | |
289 | ||
290 | LevelVector def_levels = {1, 0, 1, 1}; | |
291 | LevelVector rep_levels = {}; | |
292 | std::vector<int32_t> values = {4, 5, 6}; | |
293 | ||
294 | auto expected = ArrayFromJSON(int32(), "[4, null, 5, 6]"); | |
295 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
296 | } | |
297 | ||
298 | TEST_F(TestReconstructColumn, PrimitiveRequired) { | |
299 | SetParquetSchema( | |
300 | PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::INT32)); | |
301 | ||
302 | LevelVector def_levels = {}; | |
303 | LevelVector rep_levels = {}; | |
304 | std::vector<int32_t> values = {4, 5, 6}; | |
305 | ||
306 | auto expected = ArrayFromJSON(int32(), "[4, 5, 6]"); | |
307 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
308 | } | |
309 | ||
310 | TEST_F(TestReconstructColumn, PrimitiveRepeated) { | |
311 | // Arrow schema: list(int32 not null) not null | |
312 | this->SetParquetSchema( | |
313 | PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::INT32)); | |
314 | ||
315 | LevelVector def_levels = {0, 1, 1, 1}; | |
316 | LevelVector rep_levels = {0, 0, 1, 0}; | |
317 | std::vector<int32_t> values = {4, 5, 6}; | |
318 | ||
319 | auto expected = ArrayFromJSON(list(field("node_name", int32(), /*nullable=*/false)), | |
320 | "[[], [4, 5], [6]]"); | |
321 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
322 | } | |
323 | ||
324 | // | |
325 | // Struct encodings (one field each) | |
326 | // | |
327 | ||
328 | TEST_F(TestReconstructColumn, NestedRequiredRequired) { | |
329 | // Arrow schema: struct(a: int32 not null) not null | |
330 | SetParquetSchema(GroupNode::Make( | |
331 | "parent", Repetition::REQUIRED, | |
332 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32)})); | |
333 | ||
334 | LevelVector def_levels = {}; | |
335 | LevelVector rep_levels = {}; | |
336 | std::vector<int32_t> values = {4, 5, 6}; | |
337 | ||
338 | auto expected = ArrayFromJSON(OneFieldStruct("a", int32(), false), | |
339 | R"([{"a": 4}, {"a": 5}, {"a": 6}])"); | |
340 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
341 | } | |
342 | ||
343 | TEST_F(TestReconstructColumn, NestedOptionalRequired) { | |
344 | // Arrow schema: struct(a: int32 not null) | |
345 | SetParquetSchema(GroupNode::Make( | |
346 | "parent", Repetition::OPTIONAL, | |
347 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32)})); | |
348 | ||
349 | LevelVector def_levels = {0, 1, 1, 1}; | |
350 | LevelVector rep_levels = {}; | |
351 | std::vector<int32_t> values = {4, 5, 6}; | |
352 | ||
353 | auto expected = ArrayFromJSON(OneFieldStruct("a", int32(), false), | |
354 | R"([null, {"a": 4}, {"a": 5}, {"a": 6}])"); | |
355 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
356 | } | |
357 | ||
358 | TEST_F(TestReconstructColumn, NestedRequiredOptional) { | |
359 | // Arrow schema: struct(a: int32) not null | |
360 | SetParquetSchema(GroupNode::Make( | |
361 | "parent", Repetition::REQUIRED, | |
362 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32)})); | |
363 | ||
364 | LevelVector def_levels = {0, 1, 1, 1}; | |
365 | LevelVector rep_levels = {}; | |
366 | std::vector<int32_t> values = {4, 5, 6}; | |
367 | ||
368 | auto expected = ArrayFromJSON(OneFieldStruct("a", int32()), | |
369 | R"([{"a": null}, {"a": 4}, {"a": 5}, {"a": 6}])"); | |
370 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
371 | } | |
372 | ||
373 | TEST_F(TestReconstructColumn, NestedOptionalOptional) { | |
374 | // Arrow schema: struct(a: int32) | |
375 | SetParquetSchema(GroupNode::Make( | |
376 | "parent", Repetition::OPTIONAL, | |
377 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32)})); | |
378 | ||
379 | LevelVector def_levels = {0, 1, 2, 2}; | |
380 | LevelVector rep_levels = {}; | |
381 | std::vector<int32_t> values = {4, 5}; | |
382 | ||
383 | auto expected = ArrayFromJSON(OneFieldStruct("a", int32()), | |
384 | R"([null, {"a": null}, {"a": 4}, {"a": 5}])"); | |
385 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
386 | } | |
387 | ||
388 | // | |
389 | // Nested struct encodings (one field each) | |
390 | // | |
391 | ||
392 | TEST_F(TestReconstructColumn, NestedRequiredRequiredRequired) { | |
393 | // Arrow schema: struct(a: struct(b: int32 not null) not null) not null | |
394 | SetParquetSchema(GroupNode::Make( | |
395 | "parent", Repetition::REQUIRED, | |
396 | {GroupNode::Make( | |
397 | "a", Repetition::REQUIRED, | |
398 | {PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})})); | |
399 | ||
400 | LevelVector def_levels = {}; | |
401 | LevelVector rep_levels = {}; | |
402 | std::vector<int32_t> values = {4, 5, 6}; | |
403 | ||
404 | auto expected = | |
405 | ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false), false), | |
406 | R"([{"a": {"b": 4}}, | |
407 | {"a": {"b": 5}}, | |
408 | {"a": {"b": 6}} | |
409 | ])"); | |
410 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
411 | } | |
412 | ||
413 | TEST_F(TestReconstructColumn, NestedRequiredOptionalRequired) { | |
414 | // Arrow schema: struct(a: struct(b: int32 not null)) not null | |
415 | SetParquetSchema(GroupNode::Make( | |
416 | "parent", Repetition::REQUIRED, | |
417 | {GroupNode::Make( | |
418 | "a", Repetition::OPTIONAL, | |
419 | {PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})})); | |
420 | ||
421 | LevelVector def_levels = {1, 0, 1, 1}; | |
422 | LevelVector rep_levels = {}; | |
423 | std::vector<int32_t> values = {4, 5, 6}; | |
424 | ||
425 | auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false)), | |
426 | R"([{"a": {"b": 4}}, | |
427 | {"a": null}, | |
428 | {"a": {"b": 5}}, | |
429 | {"a": {"b": 6}} | |
430 | ])"); | |
431 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
432 | } | |
433 | ||
434 | TEST_F(TestReconstructColumn, NestedOptionalRequiredOptional) { | |
435 | // Arrow schema: struct(a: struct(b: int32) not null) | |
436 | SetParquetSchema(GroupNode::Make( | |
437 | "parent", Repetition::OPTIONAL, | |
438 | {GroupNode::Make( | |
439 | "a", Repetition::REQUIRED, | |
440 | {PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})})); | |
441 | ||
442 | LevelVector def_levels = {1, 2, 0, 2, 2}; | |
443 | LevelVector rep_levels = {}; | |
444 | std::vector<int32_t> values = {4, 5, 6}; | |
445 | ||
446 | auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32()), false), | |
447 | R"([{"a": {"b": null}}, | |
448 | {"a": {"b": 4}}, | |
449 | null, | |
450 | {"a": {"b": 5}}, | |
451 | {"a": {"b": 6}} | |
452 | ])"); | |
453 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
454 | } | |
455 | ||
456 | TEST_F(TestReconstructColumn, NestedOptionalOptionalOptional) { | |
457 | // Arrow schema: struct(a: struct(b: int32) not null) | |
458 | SetParquetSchema(GroupNode::Make( | |
459 | "parent", Repetition::OPTIONAL, | |
460 | {GroupNode::Make( | |
461 | "a", Repetition::OPTIONAL, | |
462 | {PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})})); | |
463 | ||
464 | LevelVector def_levels = {1, 2, 0, 3, 3, 3}; | |
465 | LevelVector rep_levels = {}; | |
466 | std::vector<int32_t> values = {4, 5, 6}; | |
467 | ||
468 | auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32())), | |
469 | R"([{"a": null}, | |
470 | {"a": {"b": null}}, | |
471 | null, | |
472 | {"a": {"b": 4}}, | |
473 | {"a": {"b": 5}}, | |
474 | {"a": {"b": 6}} | |
475 | ])"); | |
476 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
477 | } | |
478 | ||
479 | // | |
480 | // Struct encodings (two fields) | |
481 | // | |
482 | ||
483 | TEST_F(TestReconstructColumn, NestedTwoFields1) { | |
484 | // Arrow schema: struct(a: int32 not null, b: int64 not null) not null | |
485 | SetParquetSchema(GroupNode::Make( | |
486 | "parent", Repetition::REQUIRED, | |
487 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32), | |
488 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})); | |
489 | ||
490 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6})); | |
491 | ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9})); | |
492 | ||
493 | auto type = struct_( | |
494 | {field("a", int32(), /*nullable=*/false), field("b", int64(), /*nullable=*/false)}); | |
495 | auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": 7}, | |
496 | {"a": 5, "b": 8}, | |
497 | {"a": 6, "b": 9}])"); | |
498 | ||
499 | CheckColumn(/*column_index=*/0, *expected); | |
500 | } | |
501 | ||
502 | TEST_F(TestReconstructColumn, NestedTwoFields2) { | |
503 | // Arrow schema: struct(a: int32 not null, b: int64) not null | |
504 | SetParquetSchema(GroupNode::Make( | |
505 | "parent", Repetition::REQUIRED, | |
506 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32), | |
507 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)})); | |
508 | ||
509 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6})); | |
510 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8})); | |
511 | ||
512 | auto type = struct_({field("a", int32(), /*nullable=*/false), field("b", int64())}); | |
513 | auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": null}, | |
514 | {"a": 5, "b": 7}, | |
515 | {"a": 6, "b": 8}])"); | |
516 | ||
517 | CheckColumn(/*column_index=*/0, *expected); | |
518 | } | |
519 | ||
520 | TEST_F(TestReconstructColumn, NestedTwoFields3) { | |
521 | // Arrow schema: struct(a: int32 not null, b: int64 not null) | |
522 | SetParquetSchema(GroupNode::Make( | |
523 | "parent", Repetition::OPTIONAL, | |
524 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32), | |
525 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})); | |
526 | ||
527 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5})); | |
528 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8})); | |
529 | ||
530 | auto type = struct_( | |
531 | {field("a", int32(), /*nullable=*/false), field("b", int64(), /*nullable=*/false)}); | |
532 | auto expected = ArrayFromJSON(type, R"([null, | |
533 | {"a": 4, "b": 7}, | |
534 | {"a": 5, "b": 8}])"); | |
535 | ||
536 | CheckColumn(/*column_index=*/0, *expected); | |
537 | } | |
538 | ||
539 | TEST_F(TestReconstructColumn, NestedTwoFields4) { | |
540 | // Arrow schema: struct(a: int32, b: int64 not null) | |
541 | SetParquetSchema(GroupNode::Make( | |
542 | "parent", Repetition::OPTIONAL, | |
543 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
544 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})); | |
545 | ||
546 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, RepLevels{}, Int32Vector{4})); | |
547 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8})); | |
548 | ||
549 | auto type = struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}); | |
550 | auto expected = ArrayFromJSON(type, R"([null, | |
551 | {"a": null, "b": 7}, | |
552 | {"a": 4, "b": 8}])"); | |
553 | ||
554 | CheckColumn(/*column_index=*/0, *expected); | |
555 | } | |
556 | ||
557 | TEST_F(TestReconstructColumn, NestedTwoFields5) { | |
558 | // Arrow schema: struct(a: int32, b: int64) | |
559 | SetParquetSchema(GroupNode::Make( | |
560 | "parent", Repetition::OPTIONAL, | |
561 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
562 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)})); | |
563 | ||
564 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, RepLevels{}, Int32Vector{4})); | |
565 | ASSERT_OK(WriteInt64Column(DefLevels{0, 2, 1}, RepLevels{}, Int64Vector{7})); | |
566 | ||
567 | auto type = struct_({field("a", int32()), field("b", int64())}); | |
568 | auto expected = ArrayFromJSON(type, R"([null, | |
569 | {"a": null, "b": 7}, | |
570 | {"a": 4, "b": null}])"); | |
571 | ||
572 | CheckColumn(/*column_index=*/0, *expected); | |
573 | } | |
574 | ||
575 | // | |
576 | // Nested struct encodings (two fields) | |
577 | // | |
578 | ||
579 | TEST_F(TestReconstructColumn, NestedNestedTwoFields1) { | |
580 | // Arrow schema: struct(a: struct(aa: int32 not null, | |
581 | // ab: int64 not null) not null, | |
582 | // b: int32 not null) not null | |
583 | SetParquetSchema(GroupNode::Make( | |
584 | "parent", Repetition::REQUIRED, | |
585 | {GroupNode::Make( | |
586 | "a", Repetition::REQUIRED, | |
587 | {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32), | |
588 | PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64)}), | |
589 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})); | |
590 | ||
591 | // aa | |
592 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6})); | |
593 | // ab | |
594 | ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9})); | |
595 | // b | |
596 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12})); | |
597 | ||
598 | auto type = struct_({field("a", | |
599 | struct_({field("aa", int32(), /*nullable=*/false), | |
600 | field("ab", int64(), /*nullable=*/false)}), | |
601 | /*nullable=*/false), | |
602 | field("b", int32(), /*nullable=*/false)}); | |
603 | auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10}, | |
604 | {"a": {"aa": 5, "ab": 8}, "b": 11}, | |
605 | {"a": {"aa": 6, "ab": 9}, "b": 12}])"); | |
606 | ||
607 | CheckColumn(/*column_index=*/0, *expected); | |
608 | } | |
609 | ||
610 | TEST_F(TestReconstructColumn, NestedNestedTwoFields2) { | |
611 | // Arrow schema: struct(a: struct(aa: int32, | |
612 | // ab: int64 not null) not null, | |
613 | // b: int32 not null) not null | |
614 | SetParquetSchema(GroupNode::Make( | |
615 | "parent", Repetition::REQUIRED, | |
616 | {GroupNode::Make( | |
617 | "a", Repetition::REQUIRED, | |
618 | {PrimitiveNode::Make("aa", Repetition::OPTIONAL, ParquetType::INT32), | |
619 | PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64)}), | |
620 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})); | |
621 | ||
622 | // aa | |
623 | ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, RepLevels{}, Int32Vector{4, 5})); | |
624 | // ab | |
625 | ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9})); | |
626 | // b | |
627 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12})); | |
628 | ||
629 | auto type = struct_( | |
630 | {field("a", | |
631 | struct_({field("aa", int32()), field("ab", int64(), /*nullable=*/false)}), | |
632 | /*nullable=*/false), | |
633 | field("b", int32(), /*nullable=*/false)}); | |
634 | auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10}, | |
635 | {"a": {"aa": null, "ab": 8}, "b": 11}, | |
636 | {"a": {"aa": 5, "ab": 9}, "b": 12}])"); | |
637 | ||
638 | CheckColumn(/*column_index=*/0, *expected); | |
639 | } | |
640 | ||
641 | TEST_F(TestReconstructColumn, NestedNestedTwoFields3) { | |
642 | // Arrow schema: struct(a: struct(aa: int32 not null, | |
643 | // ab: int64) not null, | |
644 | // b: int32) not null | |
645 | SetParquetSchema(GroupNode::Make( | |
646 | "parent", Repetition::REQUIRED, | |
647 | {GroupNode::Make( | |
648 | "a", Repetition::REQUIRED, | |
649 | {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32), | |
650 | PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}), | |
651 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})); | |
652 | ||
653 | // aa | |
654 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6})); | |
655 | // ab | |
656 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8})); | |
657 | // b | |
658 | ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, RepLevels{}, Int32Vector{10, 11})); | |
659 | ||
660 | auto type = struct_( | |
661 | {field("a", | |
662 | struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())}), | |
663 | /*nullable=*/false), | |
664 | field("b", int32())}); | |
665 | auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": null}, "b": 10}, | |
666 | {"a": {"aa": 5, "ab": 7}, "b": null}, | |
667 | {"a": {"aa": 6, "ab": 8}, "b": 11}])"); | |
668 | ||
669 | CheckColumn(/*column_index=*/0, *expected); | |
670 | } | |
671 | ||
672 | TEST_F(TestReconstructColumn, NestedNestedTwoFields4) { | |
673 | // Arrow schema: struct(a: struct(aa: int32 not null, | |
674 | // ab: int64), | |
675 | // b: int32 not null) not null | |
676 | SetParquetSchema(GroupNode::Make( | |
677 | "parent", Repetition::REQUIRED, | |
678 | {GroupNode::Make( | |
679 | "a", Repetition::OPTIONAL, | |
680 | {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32), | |
681 | PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}), | |
682 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})); | |
683 | ||
684 | // aa | |
685 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5})); | |
686 | // ab | |
687 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, RepLevels{}, Int64Vector{7})); | |
688 | // b | |
689 | ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12})); | |
690 | ||
691 | auto type = struct_({field("a", struct_({field("aa", int32(), /*nullable=*/false), | |
692 | field("ab", int64())})), | |
693 | field("b", int32(), /*nullable=*/false)}); | |
694 | auto expected = ArrayFromJSON(type, R"([{"a": null, "b": 10}, | |
695 | {"a": {"aa": 4, "ab": null}, "b": 11}, | |
696 | {"a": {"aa": 5, "ab": 7}, "b": 12}])"); | |
697 | ||
698 | CheckColumn(/*column_index=*/0, *expected); | |
699 | } | |
700 | ||
701 | TEST_F(TestReconstructColumn, NestedNestedTwoFields5) { | |
702 | // Arrow schema: struct(a: struct(aa: int32 not null, | |
703 | // ab: int64) not null, | |
704 | // b: int32) | |
705 | SetParquetSchema(GroupNode::Make( | |
706 | "parent", Repetition::OPTIONAL, | |
707 | {GroupNode::Make( | |
708 | "a", Repetition::REQUIRED, | |
709 | {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32), | |
710 | PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}), | |
711 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})); | |
712 | ||
713 | // aa | |
714 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5})); | |
715 | // ab | |
716 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, RepLevels{}, Int64Vector{7})); | |
717 | // b | |
718 | ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1}, RepLevels{}, Int32Vector{10})); | |
719 | ||
720 | auto type = struct_( | |
721 | {field("a", | |
722 | struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())}), | |
723 | /*nullable=*/false), | |
724 | field("b", int32())}); | |
725 | auto expected = ArrayFromJSON(type, R"([null, | |
726 | {"a": {"aa": 4, "ab": null}, "b": 10}, | |
727 | {"a": {"aa": 5, "ab": 7}, "b": null}])"); | |
728 | ||
729 | CheckColumn(/*column_index=*/0, *expected); | |
730 | } | |
731 | ||
732 | TEST_F(TestReconstructColumn, NestedNestedTwoFields6) { | |
733 | // Arrow schema: struct(a: struct(aa: int32 not null, | |
734 | // ab: int64), | |
735 | // b: int32) | |
736 | SetParquetSchema(GroupNode::Make( | |
737 | "parent", Repetition::OPTIONAL, | |
738 | {GroupNode::Make( | |
739 | "a", Repetition::OPTIONAL, | |
740 | {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32), | |
741 | PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}), | |
742 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})); | |
743 | ||
744 | // aa | |
745 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2}, RepLevels{}, Int32Vector{4, 5})); | |
746 | // ab | |
747 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 3}, RepLevels{}, Int64Vector{7})); | |
748 | // b | |
749 | ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{}, Int32Vector{10, 11})); | |
750 | ||
751 | auto type = struct_({field("a", struct_({field("aa", int32(), /*nullable=*/false), | |
752 | field("ab", int64())})), | |
753 | field("b", int32())}); | |
754 | auto expected = ArrayFromJSON(type, R"([null, | |
755 | {"a": null, "b": 10}, | |
756 | {"a": {"aa": 4, "ab": null}, "b": null}, | |
757 | {"a": {"aa": 5, "ab": 7}, "b": 11}])"); | |
758 | ||
759 | CheckColumn(/*column_index=*/0, *expected); | |
760 | } | |
761 | ||
762 | // | |
763 | // Three-level list encodings | |
764 | // | |
765 | ||
766 | TEST_F(TestReconstructColumn, ThreeLevelListRequiredRequired) { | |
767 | // Arrow schema: list(int32 not null) not null | |
768 | SetParquetSchema(GroupNode::Make( | |
769 | "parent", Repetition::REQUIRED, | |
770 | {GroupNode::Make( | |
771 | "list", Repetition::REPEATED, | |
772 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
773 | LogicalType::List())); | |
774 | ||
775 | LevelVector def_levels = {0, 1, 1, 1}; | |
776 | LevelVector rep_levels = {0, 0, 1, 0}; | |
777 | std::vector<int32_t> values = {4, 5, 6}; | |
778 | ||
779 | // TODO should field name "element" (Parquet convention for List nodes) | |
780 | // be changed to "item" (Arrow convention for List types)? | |
781 | auto expected = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]"); | |
782 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
783 | } | |
784 | ||
785 | TEST_F(TestReconstructColumn, ThreeLevelListOptionalRequired) { | |
786 | // Arrow schema: list(int32 not null) | |
787 | SetParquetSchema(GroupNode::Make( | |
788 | "parent", Repetition::OPTIONAL, | |
789 | {GroupNode::Make( | |
790 | "list", Repetition::REPEATED, | |
791 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
792 | LogicalType::List())); | |
793 | ||
794 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
795 | LevelVector rep_levels = {0, 0, 0, 1, 0}; | |
796 | std::vector<int32_t> values = {4, 5, 6}; | |
797 | ||
798 | auto expected = | |
799 | ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]"); | |
800 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
801 | } | |
802 | ||
803 | TEST_F(TestReconstructColumn, ThreeLevelListRequiredOptional) { | |
804 | // Arrow schema: list(int32) not null | |
805 | SetParquetSchema(GroupNode::Make( | |
806 | "parent", Repetition::REQUIRED, | |
807 | {GroupNode::Make( | |
808 | "list", Repetition::REPEATED, | |
809 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})}, | |
810 | LogicalType::List())); | |
811 | ||
812 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
813 | LevelVector rep_levels = {0, 0, 1, 0, 1}; | |
814 | std::vector<int32_t> values = {4, 5, 6}; | |
815 | ||
816 | auto expected = ArrayFromJSON(List(int32()), "[[], [null, 4], [5, 6]]"); | |
817 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
818 | } | |
819 | ||
820 | TEST_F(TestReconstructColumn, ThreeLevelListOptionalOptional) { | |
821 | // Arrow schema: list(int32) | |
822 | SetParquetSchema(GroupNode::Make( | |
823 | "parent", Repetition::OPTIONAL, | |
824 | {GroupNode::Make( | |
825 | "list", Repetition::REPEATED, | |
826 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})}, | |
827 | LogicalType::List())); | |
828 | ||
829 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
830 | LevelVector rep_levels = {0, 0, 0, 1, 0, 1}; | |
831 | std::vector<int32_t> values = {4, 5, 6}; | |
832 | ||
833 | auto expected = ArrayFromJSON(List(int32()), "[null, [], [null, 4], [5, 6]]"); | |
834 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
835 | } | |
836 | ||
837 | // | |
838 | // Legacy list encodings | |
839 | // | |
840 | ||
841 | TEST_F(TestReconstructColumn, TwoLevelListRequired) { | |
842 | // Arrow schema: list(int32 not null) not null | |
843 | SetParquetSchema(GroupNode::Make( | |
844 | "parent", Repetition::REQUIRED, | |
845 | {PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32)}, | |
846 | LogicalType::List())); | |
847 | ||
848 | LevelVector def_levels = {0, 1, 1, 1}; | |
849 | LevelVector rep_levels = {0, 0, 1, 0}; | |
850 | std::vector<int32_t> values = {4, 5, 6}; | |
851 | ||
852 | // TODO should field name "element" (Parquet convention for List nodes) | |
853 | // be changed to "item" (Arrow convention for List types)? | |
854 | auto expected = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]"); | |
855 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
856 | } | |
857 | ||
858 | TEST_F(TestReconstructColumn, TwoLevelListOptional) { | |
859 | // Arrow schema: list(int32 not null) | |
860 | SetParquetSchema(GroupNode::Make( | |
861 | "parent", Repetition::OPTIONAL, | |
862 | {PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32)}, | |
863 | LogicalType::List())); | |
864 | ||
865 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
866 | LevelVector rep_levels = {0, 0, 0, 1, 0}; | |
867 | std::vector<int32_t> values = {4, 5, 6}; | |
868 | ||
869 | auto expected = | |
870 | ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]"); | |
871 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
872 | } | |
873 | ||
874 | // | |
875 | // List-in-struct | |
876 | // | |
877 | ||
878 | TEST_F(TestReconstructColumn, NestedList1) { | |
879 | // Arrow schema: struct(a: list(int32 not null) not null) not null | |
880 | SetParquetSchema(GroupNode::Make( | |
881 | "a", Repetition::REQUIRED, | |
882 | {GroupNode::Make( | |
883 | "p", Repetition::REQUIRED, | |
884 | {GroupNode::Make("list", Repetition::REPEATED, | |
885 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
886 | ParquetType::INT32)})}, | |
887 | LogicalType::List())})); | |
888 | ||
889 | LevelVector def_levels = {0, 1, 1, 1}; | |
890 | LevelVector rep_levels = {0, 0, 1, 0}; | |
891 | std::vector<int32_t> values = {4, 5, 6}; | |
892 | ||
893 | auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false), | |
894 | /*nullable=*/false); | |
895 | auto expected = ArrayFromJSON(type, R"([{"p": []}, | |
896 | {"p": [4, 5]}, | |
897 | {"p": [6]}])"); | |
898 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
899 | } | |
900 | ||
901 | TEST_F(TestReconstructColumn, NestedList2) { | |
902 | // Arrow schema: struct(a: list(int32 not null) not null) | |
903 | SetParquetSchema(GroupNode::Make( | |
904 | "a", Repetition::OPTIONAL, | |
905 | {GroupNode::Make( | |
906 | "p", Repetition::REQUIRED, | |
907 | {GroupNode::Make("list", Repetition::REPEATED, | |
908 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
909 | ParquetType::INT32)})}, | |
910 | LogicalType::List())})); | |
911 | ||
912 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
913 | LevelVector rep_levels = {0, 0, 0, 1, 0}; | |
914 | std::vector<int32_t> values = {4, 5, 6}; | |
915 | ||
916 | auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false), | |
917 | /*nullable=*/false); | |
918 | auto expected = ArrayFromJSON(type, R"([null, | |
919 | {"p": []}, | |
920 | {"p": [4, 5]}, | |
921 | {"p": [6]}])"); | |
922 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
923 | } | |
924 | ||
925 | TEST_F(TestReconstructColumn, NestedList3) { | |
926 | // Arrow schema: struct(a: list(int32 not null)) not null | |
927 | SetParquetSchema(GroupNode::Make( | |
928 | "a", Repetition::REQUIRED, // column name (column a is a struct of) | |
929 | {GroupNode::Make( | |
930 | "p", Repetition::OPTIONAL, // name in struct | |
931 | {GroupNode::Make("list", Repetition::REPEATED, | |
932 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
933 | ParquetType::INT32)})}, | |
934 | LogicalType::List())})); | |
935 | ||
936 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
937 | LevelVector rep_levels = {0, 0, 0, 1, 0}; | |
938 | std::vector<int32_t> values = {4, 5, 6}; | |
939 | ||
940 | auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false)); | |
941 | auto expected = ArrayFromJSON(type, R"([{"p": null}, | |
942 | {"p": []}, | |
943 | {"p": [4, 5]}, | |
944 | {"p": [6]}])"); | |
945 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
946 | } | |
947 | ||
948 | TEST_F(TestReconstructColumn, NestedList4) { | |
949 | // Arrow schema: struct(a: list(int32 not null)) | |
950 | SetParquetSchema(GroupNode::Make( | |
951 | "a", Repetition::OPTIONAL, | |
952 | {GroupNode::Make( | |
953 | "p", Repetition::OPTIONAL, | |
954 | {GroupNode::Make("list", Repetition::REPEATED, | |
955 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
956 | ParquetType::INT32)})}, | |
957 | LogicalType::List())})); | |
958 | ||
959 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
960 | LevelVector rep_levels = {0, 0, 0, 0, 1, 0}; | |
961 | std::vector<int32_t> values = {4, 5, 6}; | |
962 | ||
963 | auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false)); | |
964 | auto expected = ArrayFromJSON(type, R"([null, | |
965 | {"p": null}, | |
966 | {"p": []}, | |
967 | {"p": [4, 5]}, | |
968 | {"p": [6]}])"); | |
969 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
970 | } | |
971 | ||
972 | TEST_F(TestReconstructColumn, NestedList5) { | |
973 | // Arrow schema: struct(a: list(int32) not null) | |
974 | SetParquetSchema(GroupNode::Make( | |
975 | "a", Repetition::OPTIONAL, | |
976 | {GroupNode::Make( | |
977 | "p", Repetition::REQUIRED, | |
978 | {GroupNode::Make("list", Repetition::REPEATED, | |
979 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, | |
980 | ParquetType::INT32)})}, | |
981 | LogicalType::List())})); | |
982 | ||
983 | LevelVector def_levels = {0, 1, 3, 2, 3, 3}; | |
984 | LevelVector rep_levels = {0, 0, 0, 1, 0, 1}; | |
985 | std::vector<int32_t> values = {4, 5, 6}; | |
986 | ||
987 | auto type = OneFieldStruct("p", List(int32()), /*nullable=*/false); | |
988 | auto expected = ArrayFromJSON(type, R"([null, | |
989 | {"p": []}, | |
990 | {"p": [4, null]}, | |
991 | {"p": [5, 6]}])"); | |
992 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
993 | } | |
994 | ||
995 | TEST_F(TestReconstructColumn, NestedList6) { | |
996 | // Arrow schema: struct(a: list(int32)) | |
997 | SetParquetSchema(GroupNode::Make( | |
998 | "a", Repetition::OPTIONAL, | |
999 | {GroupNode::Make( | |
1000 | "p", Repetition::OPTIONAL, | |
1001 | {GroupNode::Make("list", Repetition::REPEATED, | |
1002 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, | |
1003 | ParquetType::INT32)})}, | |
1004 | LogicalType::List())})); | |
1005 | ||
1006 | LevelVector def_levels = {0, 1, 2, 4, 3, 4, 4}; | |
1007 | LevelVector rep_levels = {0, 0, 0, 0, 1, 0, 1}; | |
1008 | std::vector<int32_t> values = {4, 5, 6}; | |
1009 | ||
1010 | auto type = OneFieldStruct("p", List(int32())); | |
1011 | auto expected = ArrayFromJSON(type, R"([null, | |
1012 | {"p": null}, | |
1013 | {"p": []}, | |
1014 | {"p": [4, null]}, | |
1015 | {"p": [5, 6]}])"); | |
1016 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1017 | } | |
1018 | ||
1019 | // | |
1020 | // Struct-in-list | |
1021 | // | |
1022 | ||
1023 | TEST_F(TestReconstructColumn, ListNested1) { | |
1024 | // Arrow schema: list(struct(a: int32 not null) not null) not null | |
1025 | SetParquetSchema(GroupNode::Make( | |
1026 | "parent", Repetition::REQUIRED, | |
1027 | {GroupNode::Make("list", Repetition::REPEATED, | |
1028 | {GroupNode::Make("element", Repetition::REQUIRED, | |
1029 | {PrimitiveNode::Make("a", Repetition::REQUIRED, | |
1030 | ParquetType::INT32)})})}, | |
1031 | LogicalType::List())); | |
1032 | ||
1033 | LevelVector def_levels = {0, 1, 1, 1}; | |
1034 | LevelVector rep_levels = {0, 0, 1, 0}; | |
1035 | std::vector<int32_t> values = {4, 5, 6}; | |
1036 | ||
1037 | auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false), | |
1038 | /*nullable=*/false); | |
1039 | auto expected = ArrayFromJSON(type, | |
1040 | R"([[], | |
1041 | [{"a": 4}, {"a": 5}], | |
1042 | [{"a": 6}]])"); | |
1043 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1044 | } | |
1045 | ||
1046 | TEST_F(TestReconstructColumn, ListNested2) { | |
1047 | // Arrow schema: list(struct(a: int32 not null) not null) | |
1048 | SetParquetSchema(GroupNode::Make( | |
1049 | "parent", Repetition::OPTIONAL, | |
1050 | {GroupNode::Make("list", Repetition::REPEATED, | |
1051 | {GroupNode::Make("element", Repetition::REQUIRED, | |
1052 | {PrimitiveNode::Make("a", Repetition::REQUIRED, | |
1053 | ParquetType::INT32)})})}, | |
1054 | LogicalType::List())); | |
1055 | ||
1056 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
1057 | LevelVector rep_levels = {0, 0, 0, 1, 0}; | |
1058 | std::vector<int32_t> values = {4, 5, 6}; | |
1059 | ||
1060 | auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false), | |
1061 | /*nullable=*/false); | |
1062 | auto expected = ArrayFromJSON(type, | |
1063 | R"([null, | |
1064 | [], | |
1065 | [{"a": 4}, {"a": 5}], | |
1066 | [{"a": 6}]])"); | |
1067 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1068 | } | |
1069 | ||
1070 | TEST_F(TestReconstructColumn, ListNested3) { | |
1071 | // Arrow schema: list(struct(a: int32 not null)) not null | |
1072 | SetParquetSchema(GroupNode::Make( | |
1073 | "parent", Repetition::REQUIRED, | |
1074 | {GroupNode::Make("list", Repetition::REPEATED, | |
1075 | {GroupNode::Make("element", Repetition::OPTIONAL, | |
1076 | {PrimitiveNode::Make("a", Repetition::REQUIRED, | |
1077 | ParquetType::INT32)})})}, | |
1078 | LogicalType::List())); | |
1079 | ||
1080 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
1081 | LevelVector rep_levels = {0, 0, 1, 1, 0}; | |
1082 | std::vector<int32_t> values = {4, 5, 6}; | |
1083 | ||
1084 | auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false)); | |
1085 | auto expected = ArrayFromJSON(type, | |
1086 | R"([[], | |
1087 | [null, {"a": 4}, {"a": 5}], | |
1088 | [{"a": 6}]])"); | |
1089 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1090 | } | |
1091 | ||
1092 | TEST_F(TestReconstructColumn, ListNested4) { | |
1093 | // Arrow schema: list(struct(a: int32 not null)) | |
1094 | SetParquetSchema(GroupNode::Make( | |
1095 | "parent", Repetition::OPTIONAL, | |
1096 | {GroupNode::Make("list", Repetition::REPEATED, | |
1097 | {GroupNode::Make("element", Repetition::OPTIONAL, | |
1098 | {PrimitiveNode::Make("a", Repetition::REQUIRED, | |
1099 | ParquetType::INT32)})})}, | |
1100 | LogicalType::List())); | |
1101 | ||
1102 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
1103 | LevelVector rep_levels = {0, 0, 0, 1, 1, 0}; | |
1104 | std::vector<int32_t> values = {4, 5, 6}; | |
1105 | ||
1106 | auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false)); | |
1107 | auto expected = ArrayFromJSON(type, | |
1108 | R"([null, | |
1109 | [], | |
1110 | [null, {"a": 4}, {"a": 5}], | |
1111 | [{"a": 6}]])"); | |
1112 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1113 | } | |
1114 | ||
1115 | TEST_F(TestReconstructColumn, ListNested5) { | |
1116 | // Arrow schema: list(struct(a: int32) not null) | |
1117 | SetParquetSchema(GroupNode::Make( | |
1118 | "parent", Repetition::OPTIONAL, | |
1119 | {GroupNode::Make("list", Repetition::REPEATED, | |
1120 | {GroupNode::Make("element", Repetition::REQUIRED, | |
1121 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, | |
1122 | ParquetType::INT32)})})}, | |
1123 | LogicalType::List())); | |
1124 | ||
1125 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
1126 | LevelVector rep_levels = {0, 0, 0, 1, 0, 1}; | |
1127 | std::vector<int32_t> values = {4, 5, 6}; | |
1128 | ||
1129 | auto type = List(OneFieldStruct("a", int32()), | |
1130 | /*nullable=*/false); | |
1131 | auto expected = ArrayFromJSON(type, | |
1132 | R"([null, | |
1133 | [], | |
1134 | [{"a": null}, {"a": 4}], | |
1135 | [{"a": 5}, {"a": 6}]])"); | |
1136 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1137 | } | |
1138 | ||
1139 | TEST_F(TestReconstructColumn, ListNested6) { | |
1140 | // Arrow schema: list(struct(a: int32)) | |
1141 | SetParquetSchema(GroupNode::Make( | |
1142 | "parent", Repetition::OPTIONAL, | |
1143 | {GroupNode::Make("list", Repetition::REPEATED, | |
1144 | {GroupNode::Make("element", Repetition::OPTIONAL, | |
1145 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, | |
1146 | ParquetType::INT32)})})}, | |
1147 | LogicalType::List())); | |
1148 | ||
1149 | LevelVector def_levels = {0, 1, 2, 3, 4, 4, 4}; | |
1150 | LevelVector rep_levels = {0, 0, 0, 1, 1, 0, 1}; | |
1151 | std::vector<int32_t> values = {4, 5, 6}; | |
1152 | ||
1153 | auto type = List(OneFieldStruct("a", int32())); | |
1154 | auto expected = ArrayFromJSON(type, | |
1155 | R"([null, | |
1156 | [], | |
1157 | [null, {"a": null}, {"a": 4}], | |
1158 | [{"a": 5}, {"a": 6}]])"); | |
1159 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1160 | } | |
1161 | ||
1162 | // | |
1163 | // Struct (two fields)-in-list | |
1164 | // | |
1165 | ||
1166 | TEST_F(TestReconstructColumn, ListNestedTwoFields1) { | |
1167 | // Arrow schema: list(struct(a: int32 not null, | |
1168 | // b: int64 not null) not null) not null | |
1169 | SetParquetSchema(GroupNode::Make( | |
1170 | "parent", Repetition::REQUIRED, | |
1171 | {GroupNode::Make( | |
1172 | "list", Repetition::REPEATED, | |
1173 | {GroupNode::Make( | |
1174 | "element", Repetition::REQUIRED, | |
1175 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32), | |
1176 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})}, | |
1177 | LogicalType::List())); | |
1178 | ||
1179 | // a | |
1180 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0}, | |
1181 | Int32Vector{4, 5, 6})); | |
1182 | // b | |
1183 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0}, | |
1184 | Int64Vector{7, 8, 9})); | |
1185 | ||
1186 | auto type = List(struct_({field("a", int32(), /*nullable=*/false), | |
1187 | field("b", int64(), /*nullable=*/false)}), | |
1188 | /*nullable=*/false); | |
1189 | auto expected = ArrayFromJSON(type, | |
1190 | R"([[], | |
1191 | [{"a": 4, "b": 7}, {"a": 5, "b": 8}], | |
1192 | [{"a": 6, "b": 9}]])"); | |
1193 | CheckColumn(/*column_index=*/0, *expected); | |
1194 | } | |
1195 | ||
1196 | TEST_F(TestReconstructColumn, ListNestedTwoFields2) { | |
1197 | // Arrow schema: list(struct(a: int32, | |
1198 | // b: int64 not null) not null) not null | |
1199 | SetParquetSchema(GroupNode::Make( | |
1200 | "parent", Repetition::REQUIRED, | |
1201 | {GroupNode::Make( | |
1202 | "list", Repetition::REPEATED, | |
1203 | {GroupNode::Make( | |
1204 | "element", Repetition::REQUIRED, | |
1205 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
1206 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})}, | |
1207 | LogicalType::List())); | |
1208 | ||
1209 | // a | |
1210 | ASSERT_OK( | |
1211 | WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{0, 0, 1, 0}, Int32Vector{4, 5})); | |
1212 | // b | |
1213 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0}, | |
1214 | Int64Vector{7, 8, 9})); | |
1215 | ||
1216 | auto type = | |
1217 | List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}), | |
1218 | /*nullable=*/false); | |
1219 | auto expected = ArrayFromJSON(type, | |
1220 | R"([[], | |
1221 | [{"a": 4, "b": 7}, {"a": null, "b": 8}], | |
1222 | [{"a": 5, "b": 9}]])"); | |
1223 | CheckColumn(/*column_index=*/0, *expected); | |
1224 | } | |
1225 | ||
1226 | TEST_F(TestReconstructColumn, ListNestedTwoFields3) { | |
1227 | // Arrow schema: list(struct(a: int32 not null, | |
1228 | // b: int64 not null)) not null | |
1229 | SetParquetSchema(GroupNode::Make( | |
1230 | "parent", Repetition::REQUIRED, | |
1231 | {GroupNode::Make( | |
1232 | "list", Repetition::REPEATED, | |
1233 | {GroupNode::Make( | |
1234 | "element", Repetition::OPTIONAL, | |
1235 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32), | |
1236 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})}, | |
1237 | LogicalType::List())); | |
1238 | ||
1239 | // a | |
1240 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0}, | |
1241 | Int32Vector{4, 5, 6})); | |
1242 | // b | |
1243 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0}, | |
1244 | Int64Vector{7, 8, 9})); | |
1245 | ||
1246 | auto type = List(struct_({field("a", int32(), /*nullable=*/false), | |
1247 | field("b", int64(), /*nullable=*/false)})); | |
1248 | auto expected = ArrayFromJSON(type, | |
1249 | R"([[], | |
1250 | [null, {"a": 4, "b": 7}, {"a": 5, "b": 8}], | |
1251 | [{"a": 6, "b": 9}]])"); | |
1252 | CheckColumn(/*column_index=*/0, *expected); | |
1253 | } | |
1254 | ||
1255 | TEST_F(TestReconstructColumn, ListNestedTwoFields4) { | |
1256 | // Arrow schema: list(struct(a: int32, | |
1257 | // b: int64 not null) not null) | |
1258 | SetParquetSchema(GroupNode::Make( | |
1259 | "parent", Repetition::OPTIONAL, | |
1260 | {GroupNode::Make( | |
1261 | "list", Repetition::REPEATED, | |
1262 | {GroupNode::Make( | |
1263 | "element", Repetition::REQUIRED, | |
1264 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
1265 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})}, | |
1266 | LogicalType::List())); | |
1267 | ||
1268 | // a | |
1269 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0}, | |
1270 | Int32Vector{4, 5})); | |
1271 | // b | |
1272 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0}, | |
1273 | Int64Vector{7, 8, 9})); | |
1274 | ||
1275 | auto type = | |
1276 | List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}), | |
1277 | /*nullable=*/false); | |
1278 | auto expected = ArrayFromJSON(type, | |
1279 | R"([null, | |
1280 | [], | |
1281 | [{"a": 4, "b": 7}, {"a": null, "b": 8}], | |
1282 | [{"a": 5, "b": 9}]])"); | |
1283 | CheckColumn(/*column_index=*/0, *expected); | |
1284 | } | |
1285 | ||
1286 | TEST_F(TestReconstructColumn, ListNestedTwoFields5) { | |
1287 | // Arrow schema: list(struct(a: int32, | |
1288 | // b: int64 not null)) | |
1289 | SetParquetSchema(GroupNode::Make( | |
1290 | "parent", Repetition::OPTIONAL, | |
1291 | {GroupNode::Make( | |
1292 | "list", Repetition::REPEATED, | |
1293 | {GroupNode::Make( | |
1294 | "element", Repetition::OPTIONAL, | |
1295 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
1296 | PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})}, | |
1297 | LogicalType::List())); | |
1298 | ||
1299 | // a | |
1300 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0}, | |
1301 | Int32Vector{4})); | |
1302 | // b | |
1303 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0}, | |
1304 | Int64Vector{7, 8})); | |
1305 | ||
1306 | auto type = | |
1307 | List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)})); | |
1308 | auto expected = ArrayFromJSON(type, | |
1309 | R"([null, | |
1310 | [], | |
1311 | [{"a": 4, "b": 7}, null], | |
1312 | [{"a": null, "b": 8}]])"); | |
1313 | CheckColumn(/*column_index=*/0, *expected); | |
1314 | } | |
1315 | ||
1316 | TEST_F(TestReconstructColumn, ListNestedTwoFields6) { | |
1317 | // Arrow schema: list(struct(a: int32, | |
1318 | // b: int64)) | |
1319 | SetParquetSchema(GroupNode::Make( | |
1320 | "parent", Repetition::OPTIONAL, | |
1321 | {GroupNode::Make( | |
1322 | "list", Repetition::REPEATED, | |
1323 | {GroupNode::Make( | |
1324 | "element", Repetition::OPTIONAL, | |
1325 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32), | |
1326 | PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)})})}, | |
1327 | LogicalType::List())); | |
1328 | ||
1329 | // a | |
1330 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0}, | |
1331 | Int32Vector{4})); | |
1332 | // b | |
1333 | ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 4}, RepLevels{0, 0, 0, 1, 0}, | |
1334 | Int64Vector{7})); | |
1335 | ||
1336 | auto type = List(struct_({field("a", int32()), field("b", int64())})); | |
1337 | auto expected = ArrayFromJSON(type, | |
1338 | R"([null, | |
1339 | [], | |
1340 | [{"a": 4, "b": null}, null], | |
1341 | [{"a": null, "b": 7}]])"); | |
1342 | CheckColumn(/*column_index=*/0, *expected); | |
1343 | } | |
1344 | ||
1345 | // | |
1346 | // List-in-struct (two fields) | |
1347 | // | |
1348 | ||
1349 | TEST_F(TestReconstructColumn, NestedTwoFieldsList1) { | |
1350 | // Arrow schema: struct(a: int64 not null, | |
1351 | // b: list(int32 not null) not null | |
1352 | // ) not null | |
1353 | SetParquetSchema(GroupNode::Make( | |
1354 | "parent", Repetition::REQUIRED, | |
1355 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64), | |
1356 | GroupNode::Make( | |
1357 | "b", Repetition::REQUIRED, | |
1358 | {GroupNode::Make("list", Repetition::REPEATED, | |
1359 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
1360 | ParquetType::INT32)})}, | |
1361 | LogicalType::List())})); | |
1362 | ||
1363 | // a | |
1364 | ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{4, 5, 6})); | |
1365 | // b | |
1366 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0}, | |
1367 | Int32Vector{7, 8, 9})); | |
1368 | ||
1369 | auto type = | |
1370 | struct_({field("a", int64(), /*nullable=*/false), | |
1371 | field("b", List(int32(), /*nullable=*/false), /*nullable=*/false)}); | |
1372 | auto expected = ArrayFromJSON(type, | |
1373 | R"([{"a": 4, "b": []}, | |
1374 | {"a": 5, "b": [7, 8]}, | |
1375 | {"a": 6, "b": [9]}])"); | |
1376 | CheckColumn(/*column_index=*/0, *expected); | |
1377 | } | |
1378 | ||
1379 | TEST_F(TestReconstructColumn, NestedTwoFieldsList2) { | |
1380 | // Arrow schema: struct(a: int64 not null, | |
1381 | // b: list(int32 not null) | |
1382 | // ) not null | |
1383 | SetParquetSchema(GroupNode::Make( | |
1384 | "parent", Repetition::REQUIRED, | |
1385 | {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64), | |
1386 | GroupNode::Make( | |
1387 | "b", Repetition::OPTIONAL, | |
1388 | {GroupNode::Make("list", Repetition::REPEATED, | |
1389 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
1390 | ParquetType::INT32)})}, | |
1391 | LogicalType::List())})); | |
1392 | ||
1393 | // a | |
1394 | ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{3, 4, 5, 6})); | |
1395 | // b | |
1396 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0}, | |
1397 | Int32Vector{7, 8, 9})); | |
1398 | ||
1399 | auto type = struct_({field("a", int64(), /*nullable=*/false), | |
1400 | field("b", List(int32(), /*nullable=*/false))}); | |
1401 | auto expected = ArrayFromJSON(type, | |
1402 | R"([{"a": 3, "b": null}, | |
1403 | {"a": 4, "b": []}, | |
1404 | {"a": 5, "b": [7, 8]}, | |
1405 | {"a": 6, "b": [9]}])"); | |
1406 | CheckColumn(/*column_index=*/0, *expected); | |
1407 | } | |
1408 | ||
1409 | TEST_F(TestReconstructColumn, NestedTwoFieldsList3) { | |
1410 | // Arrow schema: struct(a: int64, | |
1411 | // b: list(int32 not null) | |
1412 | // ) not null | |
1413 | SetParquetSchema(GroupNode::Make( | |
1414 | "parent", Repetition::REQUIRED, | |
1415 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64), | |
1416 | GroupNode::Make( | |
1417 | "b", Repetition::OPTIONAL, | |
1418 | {GroupNode::Make("list", Repetition::REPEATED, | |
1419 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
1420 | ParquetType::INT32)})}, | |
1421 | LogicalType::List())})); | |
1422 | ||
1423 | // a | |
1424 | ASSERT_OK(WriteInt64Column(DefLevels{1, 1, 0, 1}, RepLevels{}, Int64Vector{4, 5, 6})); | |
1425 | // b | |
1426 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0}, | |
1427 | Int32Vector{7, 8, 9})); | |
1428 | ||
1429 | auto type = | |
1430 | struct_({field("a", int64()), field("b", List(int32(), /*nullable=*/false))}); | |
1431 | auto expected = ArrayFromJSON(type, | |
1432 | R"([{"a": 4, "b": null}, | |
1433 | {"a": 5, "b": []}, | |
1434 | {"a": null, "b": [7, 8]}, | |
1435 | {"a": 6, "b": [9]}])"); | |
1436 | CheckColumn(/*column_index=*/0, *expected); | |
1437 | } | |
1438 | ||
1439 | TEST_F(TestReconstructColumn, NestedTwoFieldsList4) { | |
1440 | // Arrow schema: struct(a: int64, | |
1441 | // b: list(int32 not null) | |
1442 | // ) | |
1443 | SetParquetSchema(GroupNode::Make( | |
1444 | "parent", Repetition::OPTIONAL, | |
1445 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64), | |
1446 | GroupNode::Make( | |
1447 | "b", Repetition::OPTIONAL, | |
1448 | {GroupNode::Make("list", Repetition::REPEATED, | |
1449 | {PrimitiveNode::Make("element", Repetition::REQUIRED, | |
1450 | ParquetType::INT32)})}, | |
1451 | LogicalType::List())})); | |
1452 | ||
1453 | // a | |
1454 | ASSERT_OK( | |
1455 | WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6})); | |
1456 | // b | |
1457 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 3, 3, 3}, RepLevels{0, 0, 0, 0, 1, 0}, | |
1458 | Int32Vector{7, 8, 9})); | |
1459 | ||
1460 | auto type = | |
1461 | struct_({field("a", int64()), field("b", List(int32(), /*nullable=*/false))}); | |
1462 | auto expected = ArrayFromJSON(type, | |
1463 | R"([null, | |
1464 | {"a": 4, "b": null}, | |
1465 | {"a": 5, "b": []}, | |
1466 | {"a": null, "b": [7, 8]}, | |
1467 | {"a": 6, "b": [9]}])"); | |
1468 | CheckColumn(/*column_index=*/0, *expected); | |
1469 | } | |
1470 | ||
1471 | TEST_F(TestReconstructColumn, NestedTwoFieldsList5) { | |
1472 | // Arrow schema: struct(a: int64, b: list(int32)) | |
1473 | SetParquetSchema(GroupNode::Make( | |
1474 | "parent", Repetition::OPTIONAL, | |
1475 | {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64), | |
1476 | GroupNode::Make( | |
1477 | "b", Repetition::OPTIONAL, | |
1478 | {GroupNode::Make("list", Repetition::REPEATED, | |
1479 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, | |
1480 | ParquetType::INT32)})}, | |
1481 | LogicalType::List())})); | |
1482 | ||
1483 | // a | |
1484 | ASSERT_OK( | |
1485 | WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6})); | |
1486 | // b | |
1487 | ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 4, 3, 4}, RepLevels{0, 0, 0, 0, 1, 0}, | |
1488 | Int32Vector{7, 8})); | |
1489 | ||
1490 | auto type = struct_({field("a", int64()), field("b", List(int32()))}); | |
1491 | auto expected = ArrayFromJSON(type, | |
1492 | R"([null, | |
1493 | {"a": 4, "b": null}, | |
1494 | {"a": 5, "b": []}, | |
1495 | {"a": null, "b": [7, null]}, | |
1496 | {"a": 6, "b": [8]}])"); | |
1497 | CheckColumn(/*column_index=*/0, *expected); | |
1498 | } | |
1499 | ||
1500 | // | |
1501 | // List-in-list | |
1502 | // | |
1503 | ||
1504 | TEST_F(TestReconstructColumn, ListList1) { | |
1505 | // Arrow schema: list(list(int32 not null) not null) not null | |
1506 | auto inner_list = GroupNode::Make( | |
1507 | "element", Repetition::REQUIRED, | |
1508 | {GroupNode::Make( | |
1509 | "list", Repetition::REPEATED, | |
1510 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
1511 | LogicalType::List()); | |
1512 | SetParquetSchema( | |
1513 | GroupNode::Make("parent", Repetition::REQUIRED, | |
1514 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1515 | LogicalType::List())); | |
1516 | ||
1517 | LevelVector def_levels = {0, 1, 2, 2, 2}; | |
1518 | LevelVector rep_levels = {0, 0, 1, 0, 2}; | |
1519 | std::vector<int32_t> values = {4, 5, 6}; | |
1520 | ||
1521 | auto type = List(List(int32(), /*nullable=*/false), /*nullable=*/false); | |
1522 | auto expected = ArrayFromJSON(type, "[[], [[], [4]], [[5, 6]]]"); | |
1523 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1524 | } | |
1525 | ||
1526 | TEST_F(TestReconstructColumn, ListList2) { | |
1527 | // Arrow schema: list(list(int32 not null) not null) | |
1528 | auto inner_list = GroupNode::Make( | |
1529 | "element", Repetition::REQUIRED, | |
1530 | {GroupNode::Make( | |
1531 | "list", Repetition::REPEATED, | |
1532 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
1533 | LogicalType::List()); | |
1534 | SetParquetSchema( | |
1535 | GroupNode::Make("parent", Repetition::OPTIONAL, | |
1536 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1537 | LogicalType::List())); | |
1538 | ||
1539 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
1540 | LevelVector rep_levels = {0, 0, 0, 1, 0, 2}; | |
1541 | std::vector<int32_t> values = {4, 5, 6}; | |
1542 | ||
1543 | auto type = List(List(int32(), /*nullable=*/false), /*nullable=*/false); | |
1544 | auto expected = ArrayFromJSON(type, "[null, [], [[], [4]], [[5, 6]]]"); | |
1545 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1546 | } | |
1547 | ||
1548 | TEST_F(TestReconstructColumn, ListList3) { | |
1549 | // Arrow schema: list(list(int32 not null)) not null | |
1550 | auto inner_list = GroupNode::Make( | |
1551 | "element", Repetition::OPTIONAL, | |
1552 | {GroupNode::Make( | |
1553 | "list", Repetition::REPEATED, | |
1554 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
1555 | LogicalType::List()); | |
1556 | SetParquetSchema( | |
1557 | GroupNode::Make("parent", Repetition::REQUIRED, | |
1558 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1559 | LogicalType::List())); | |
1560 | ||
1561 | LevelVector def_levels = {0, 1, 2, 3, 3, 3}; | |
1562 | LevelVector rep_levels = {0, 0, 1, 0, 1, 2}; | |
1563 | std::vector<int32_t> values = {4, 5, 6}; | |
1564 | ||
1565 | auto type = List(List(int32(), /*nullable=*/false)); | |
1566 | auto expected = ArrayFromJSON(type, "[[], [null, []], [[4], [5, 6]]]"); | |
1567 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1568 | } | |
1569 | ||
1570 | TEST_F(TestReconstructColumn, ListList4) { | |
1571 | // Arrow schema: list(list(int32 not null)) | |
1572 | auto inner_list = GroupNode::Make( | |
1573 | "element", Repetition::OPTIONAL, | |
1574 | {GroupNode::Make( | |
1575 | "list", Repetition::REPEATED, | |
1576 | {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})}, | |
1577 | LogicalType::List()); | |
1578 | SetParquetSchema( | |
1579 | GroupNode::Make("parent", Repetition::OPTIONAL, | |
1580 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1581 | LogicalType::List())); | |
1582 | ||
1583 | LevelVector def_levels = {0, 1, 2, 3, 4, 4, 4}; | |
1584 | LevelVector rep_levels = {0, 0, 0, 1, 1, 0, 2}; | |
1585 | std::vector<int32_t> values = {4, 5, 6}; | |
1586 | ||
1587 | auto type = List(List(int32(), /*nullable=*/false)); | |
1588 | auto expected = ArrayFromJSON(type, "[null, [], [null, [], [4]], [[5, 6]]]"); | |
1589 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1590 | } | |
1591 | ||
1592 | TEST_F(TestReconstructColumn, ListList5) { | |
1593 | // Arrow schema: list(list(int32) not null) | |
1594 | auto inner_list = GroupNode::Make( | |
1595 | "element", Repetition::REQUIRED, | |
1596 | {GroupNode::Make( | |
1597 | "list", Repetition::REPEATED, | |
1598 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})}, | |
1599 | LogicalType::List()); | |
1600 | SetParquetSchema( | |
1601 | GroupNode::Make("parent", Repetition::OPTIONAL, | |
1602 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1603 | LogicalType::List())); | |
1604 | ||
1605 | LevelVector def_levels = {0, 1, 2, 4, 4, 3, 4}; | |
1606 | LevelVector rep_levels = {0, 0, 0, 1, 0, 1, 2}; | |
1607 | std::vector<int32_t> values = {4, 5, 6}; | |
1608 | ||
1609 | auto type = List(List(int32()), /*nullable=*/false); | |
1610 | auto expected = ArrayFromJSON(type, "[null, [], [[], [4]], [[5], [null, 6]]]"); | |
1611 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1612 | } | |
1613 | ||
1614 | TEST_F(TestReconstructColumn, ListList6) { | |
1615 | // Arrow schema: list(list(int32)) | |
1616 | auto inner_list = GroupNode::Make( | |
1617 | "element", Repetition::OPTIONAL, | |
1618 | {GroupNode::Make( | |
1619 | "list", Repetition::REPEATED, | |
1620 | {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})}, | |
1621 | LogicalType::List()); | |
1622 | SetParquetSchema( | |
1623 | GroupNode::Make("parent", Repetition::OPTIONAL, | |
1624 | {GroupNode::Make("list", Repetition::REPEATED, {inner_list})}, | |
1625 | LogicalType::List())); | |
1626 | ||
1627 | LevelVector def_levels = {0, 1, 2, 3, 4, 5, 5, 5}; | |
1628 | LevelVector rep_levels = {0, 0, 0, 1, 1, 2, 0, 2}; | |
1629 | std::vector<int32_t> values = {4, 5, 6}; | |
1630 | ||
1631 | auto type = List(List(int32())); | |
1632 | auto expected = ArrayFromJSON(type, "[null, [], [null, [], [null, 4]], [[5, 6]]]"); | |
1633 | AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values); | |
1634 | } | |
1635 | ||
1636 | // TODO legacy-list-in-struct etc.? | |
1637 | ||
1638 | } // namespace arrow | |
1639 | } // namespace parquet |