]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/arrow/reconstruct_internal_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / arrow / reconstruct_internal_test.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "parquet/arrow/path_internal.h"
19
20#include <algorithm>
21#include <memory>
22#include <utility>
23#include <vector>
24
25#include <gmock/gmock.h>
26#include <gtest/gtest.h>
27
28#include "arrow/array/concatenate.h"
29#include "arrow/chunked_array.h"
30#include "arrow/io/memory.h"
31#include "arrow/result.h"
32#include "arrow/testing/gtest_util.h"
33#include "arrow/type_fwd.h"
34#include "arrow/util/checked_cast.h"
35#include "arrow/util/logging.h"
36
37#include "parquet/arrow/reader.h"
38#include "parquet/arrow/schema.h"
39#include "parquet/column_writer.h"
40#include "parquet/file_writer.h"
41#include "parquet/properties.h"
42
43using arrow::Array;
44using arrow::ArrayFromJSON;
45using arrow::AssertArraysEqual;
46using arrow::ChunkedArray;
47using arrow::DataType;
48using arrow::field;
49using arrow::int32;
50using arrow::int64;
51using arrow::list;
52using arrow::MemoryPool;
53using arrow::Result;
54using arrow::Status;
55using arrow::struct_;
56using arrow::internal::checked_cast;
57using arrow::internal::checked_pointer_cast;
58using arrow::io::BufferOutputStream;
59using arrow::io::BufferReader;
60
61using testing::ElementsAre;
62using testing::ElementsAreArray;
63using testing::Eq;
64using testing::NotNull;
65using testing::SizeIs;
66
67namespace parquet {
68namespace arrow {
69
70using parquet::schema::GroupNode;
71using parquet::schema::NodePtr;
72using parquet::schema::PrimitiveNode;
73
74using ParquetType = parquet::Type::type;
75template <ParquetType T>
76using ParquetTraits = parquet::type_traits<T>;
77
78using LevelVector = std::vector<int16_t>;
79// For readability
80using DefLevels = LevelVector;
81using RepLevels = LevelVector;
82using Int32Vector = std::vector<int32_t>;
83using Int64Vector = std::vector<int64_t>;
84
85// A Parquet file builder that allows writing values one leaf column at a time
86class FileBuilder {
87 public:
88 static Result<std::shared_ptr<FileBuilder>> Make(const NodePtr& group_node,
89 int num_columns) {
90 auto self = std::make_shared<FileBuilder>();
91 RETURN_NOT_OK(self->Open(group_node, num_columns));
92 return self;
93 }
94
95 Result<std::shared_ptr<Buffer>> Finish() {
96 DCHECK_EQ(column_index_, num_columns_);
97 row_group_writer_->Close();
98 file_writer_->Close();
99 return stream_->Finish();
100 }
101
102 // Write a leaf (primitive) column
103 template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
104 Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels,
105 const std::vector<C_TYPE>& values) {
106 auto column_writer = row_group_writer_->NextColumn();
107 auto column_descr = column_writer->descr();
108 const int16_t max_def_level = column_descr->max_definition_level();
109 const int16_t max_rep_level = column_descr->max_repetition_level();
110 CheckTestedLevels(def_levels, max_def_level);
111 CheckTestedLevels(rep_levels, max_rep_level);
112
113 auto typed_writer =
114 checked_cast<TypedColumnWriter<PhysicalType<TYPE>>*>(column_writer);
115
116 const int64_t num_values = static_cast<int64_t>(
117 (max_def_level > 0) ? def_levels.size()
118 : (max_rep_level > 0) ? rep_levels.size() : values.size());
119 const int64_t values_written = typed_writer->WriteBatch(
120 num_values, LevelPointerOrNull(def_levels, max_def_level),
121 LevelPointerOrNull(rep_levels, max_rep_level), values.data());
122 DCHECK_EQ(values_written, static_cast<int64_t>(values.size())); // Sanity check
123
124 column_writer->Close();
125 ++column_index_;
126 return Status::OK();
127 }
128
129 protected:
130 Status Open(const NodePtr& group_node, int num_columns) {
131 ARROW_ASSIGN_OR_RAISE(stream_, BufferOutputStream::Create());
132 file_writer_ =
133 ParquetFileWriter::Open(stream_, checked_pointer_cast<GroupNode>(group_node));
134 row_group_writer_ = file_writer_->AppendRowGroup();
135 num_columns_ = num_columns;
136 column_index_ = 0;
137 return Status::OK();
138 }
139
140 void CheckTestedLevels(const LevelVector& levels, int16_t max_level) {
141 // Tests are expected to exercise all possible levels in [0, max_level]
142 if (!levels.empty()) {
143 const int16_t max_seen_level = *std::max_element(levels.begin(), levels.end());
144 DCHECK_EQ(max_seen_level, max_level);
145 }
146 }
147
148 const int16_t* LevelPointerOrNull(const LevelVector& levels, int16_t max_level) {
149 if (max_level > 0) {
150 DCHECK_GT(levels.size(), 0);
151 return levels.data();
152 } else {
153 DCHECK_EQ(levels.size(), 0);
154 return nullptr;
155 }
156 }
157
158 std::shared_ptr<BufferOutputStream> stream_;
159 std::unique_ptr<ParquetFileWriter> file_writer_;
160 RowGroupWriter* row_group_writer_;
161 int num_columns_;
162 int column_index_;
163};
164
165// A Parquet file tester that allows reading Arrow columns, corresponding to
166// children of the top-level group node.
167class FileTester {
168 public:
169 static Result<std::shared_ptr<FileTester>> Make(std::shared_ptr<Buffer> buffer,
170 MemoryPool* pool) {
171 auto self = std::make_shared<FileTester>();
172 RETURN_NOT_OK(self->Open(buffer, pool));
173 return self;
174 }
175
176 Result<std::shared_ptr<Array>> ReadColumn(int column_index) {
177 std::shared_ptr<ChunkedArray> column;
178 RETURN_NOT_OK(file_reader_->ReadColumn(column_index, &column));
179 return ::arrow::Concatenate(column->chunks(), pool_);
180 }
181
182 void CheckColumn(int column_index, const Array& expected) {
183 ASSERT_OK_AND_ASSIGN(const auto actual, ReadColumn(column_index));
184 ASSERT_OK(actual->ValidateFull());
185 AssertArraysEqual(expected, *actual, /*verbose=*/true);
186 }
187
188 protected:
189 Status Open(std::shared_ptr<Buffer> buffer, MemoryPool* pool) {
190 pool_ = pool;
191 return OpenFile(std::make_shared<BufferReader>(buffer), pool_, &file_reader_);
192 }
193
194 MemoryPool* pool_;
195 std::unique_ptr<FileReader> file_reader_;
196};
197
198class TestReconstructColumn : public testing::Test {
199 public:
200 void SetUp() override { pool_ = ::arrow::default_memory_pool(); }
201
202 // Write the next leaf (primitive) column
203 template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
204 Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels,
205 const std::vector<C_TYPE>& values) {
206 if (!builder_) {
207 ARROW_ASSIGN_OR_RAISE(builder_,
208 FileBuilder::Make(group_node_, descriptor_->num_columns()));
209 }
210 return builder_->WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values);
211 }
212
213 template <typename C_TYPE>
214 Status WriteInt32Column(const LevelVector& def_levels, const LevelVector& rep_levels,
215 const std::vector<C_TYPE>& values) {
216 return WriteColumn<ParquetType::INT32>(def_levels, rep_levels, values);
217 }
218
219 template <typename C_TYPE>
220 Status WriteInt64Column(const LevelVector& def_levels, const LevelVector& rep_levels,
221 const std::vector<C_TYPE>& values) {
222 return WriteColumn<ParquetType::INT64>(def_levels, rep_levels, values);
223 }
224
225 // Read a Arrow column and check its values
226 void CheckColumn(int column_index, const Array& expected) {
227 if (!tester_) {
228 ASSERT_OK_AND_ASSIGN(auto buffer, builder_->Finish());
229 ASSERT_OK_AND_ASSIGN(tester_, FileTester::Make(buffer, pool_));
230 }
231 tester_->CheckColumn(column_index, expected);
232 }
233
234 void CheckColumn(const Array& expected) { CheckColumn(/*column_index=*/0, expected); }
235
236 // One-column shortcut
237 template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
238 void AssertReconstruct(const Array& expected, const LevelVector& def_levels,
239 const LevelVector& rep_levels,
240 const std::vector<C_TYPE>& values) {
241 ASSERT_OK((WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values)));
242 CheckColumn(/*column_index=*/0, expected);
243 }
244
245 ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
246 descriptor_.reset(new SchemaDescriptor());
247 manifest_.reset(new SchemaManifest());
248 group_node_ = GroupNode::Make("root", Repetition::REQUIRED, {column});
249 descriptor_->Init(group_node_);
250 return SchemaManifest::Make(descriptor_.get(),
251 std::shared_ptr<const ::arrow::KeyValueMetadata>(),
252 ArrowReaderProperties(), manifest_.get());
253 }
254
255 void SetParquetSchema(const NodePtr& column) {
256 ASSERT_OK(MaybeSetParquetSchema(column));
257 }
258
259 protected:
260 MemoryPool* pool_;
261 NodePtr group_node_;
262 std::unique_ptr<SchemaDescriptor> descriptor_;
263 std::unique_ptr<SchemaManifest> manifest_;
264
265 std::shared_ptr<FileBuilder> builder_;
266 std::shared_ptr<FileTester> tester_;
267};
268
269static std::shared_ptr<DataType> OneFieldStruct(const std::string& name,
270 std::shared_ptr<DataType> type,
271 bool nullable = true) {
272 return struct_({field(name, type, nullable)});
273}
274
275static std::shared_ptr<DataType> List(std::shared_ptr<DataType> type,
276 bool nullable = true) {
277 // TODO should field name "element" (Parquet convention for List nodes)
278 // be changed to "item" (Arrow convention for List types)?
279 return list(field("element", type, nullable));
280}
281
282//
283// Primitive columns with no intermediate group node
284//
285
286TEST_F(TestReconstructColumn, PrimitiveOptional) {
287 SetParquetSchema(
288 PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::INT32));
289
290 LevelVector def_levels = {1, 0, 1, 1};
291 LevelVector rep_levels = {};
292 std::vector<int32_t> values = {4, 5, 6};
293
294 auto expected = ArrayFromJSON(int32(), "[4, null, 5, 6]");
295 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
296}
297
298TEST_F(TestReconstructColumn, PrimitiveRequired) {
299 SetParquetSchema(
300 PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::INT32));
301
302 LevelVector def_levels = {};
303 LevelVector rep_levels = {};
304 std::vector<int32_t> values = {4, 5, 6};
305
306 auto expected = ArrayFromJSON(int32(), "[4, 5, 6]");
307 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
308}
309
310TEST_F(TestReconstructColumn, PrimitiveRepeated) {
311 // Arrow schema: list(int32 not null) not null
312 this->SetParquetSchema(
313 PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::INT32));
314
315 LevelVector def_levels = {0, 1, 1, 1};
316 LevelVector rep_levels = {0, 0, 1, 0};
317 std::vector<int32_t> values = {4, 5, 6};
318
319 auto expected = ArrayFromJSON(list(field("node_name", int32(), /*nullable=*/false)),
320 "[[], [4, 5], [6]]");
321 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
322}
323
324//
325// Struct encodings (one field each)
326//
327
328TEST_F(TestReconstructColumn, NestedRequiredRequired) {
329 // Arrow schema: struct(a: int32 not null) not null
330 SetParquetSchema(GroupNode::Make(
331 "parent", Repetition::REQUIRED,
332 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32)}));
333
334 LevelVector def_levels = {};
335 LevelVector rep_levels = {};
336 std::vector<int32_t> values = {4, 5, 6};
337
338 auto expected = ArrayFromJSON(OneFieldStruct("a", int32(), false),
339 R"([{"a": 4}, {"a": 5}, {"a": 6}])");
340 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
341}
342
343TEST_F(TestReconstructColumn, NestedOptionalRequired) {
344 // Arrow schema: struct(a: int32 not null)
345 SetParquetSchema(GroupNode::Make(
346 "parent", Repetition::OPTIONAL,
347 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32)}));
348
349 LevelVector def_levels = {0, 1, 1, 1};
350 LevelVector rep_levels = {};
351 std::vector<int32_t> values = {4, 5, 6};
352
353 auto expected = ArrayFromJSON(OneFieldStruct("a", int32(), false),
354 R"([null, {"a": 4}, {"a": 5}, {"a": 6}])");
355 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
356}
357
358TEST_F(TestReconstructColumn, NestedRequiredOptional) {
359 // Arrow schema: struct(a: int32) not null
360 SetParquetSchema(GroupNode::Make(
361 "parent", Repetition::REQUIRED,
362 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32)}));
363
364 LevelVector def_levels = {0, 1, 1, 1};
365 LevelVector rep_levels = {};
366 std::vector<int32_t> values = {4, 5, 6};
367
368 auto expected = ArrayFromJSON(OneFieldStruct("a", int32()),
369 R"([{"a": null}, {"a": 4}, {"a": 5}, {"a": 6}])");
370 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
371}
372
373TEST_F(TestReconstructColumn, NestedOptionalOptional) {
374 // Arrow schema: struct(a: int32)
375 SetParquetSchema(GroupNode::Make(
376 "parent", Repetition::OPTIONAL,
377 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32)}));
378
379 LevelVector def_levels = {0, 1, 2, 2};
380 LevelVector rep_levels = {};
381 std::vector<int32_t> values = {4, 5};
382
383 auto expected = ArrayFromJSON(OneFieldStruct("a", int32()),
384 R"([null, {"a": null}, {"a": 4}, {"a": 5}])");
385 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
386}
387
388//
389// Nested struct encodings (one field each)
390//
391
392TEST_F(TestReconstructColumn, NestedRequiredRequiredRequired) {
393 // Arrow schema: struct(a: struct(b: int32 not null) not null) not null
394 SetParquetSchema(GroupNode::Make(
395 "parent", Repetition::REQUIRED,
396 {GroupNode::Make(
397 "a", Repetition::REQUIRED,
398 {PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})}));
399
400 LevelVector def_levels = {};
401 LevelVector rep_levels = {};
402 std::vector<int32_t> values = {4, 5, 6};
403
404 auto expected =
405 ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false), false),
406 R"([{"a": {"b": 4}},
407 {"a": {"b": 5}},
408 {"a": {"b": 6}}
409 ])");
410 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
411}
412
413TEST_F(TestReconstructColumn, NestedRequiredOptionalRequired) {
414 // Arrow schema: struct(a: struct(b: int32 not null)) not null
415 SetParquetSchema(GroupNode::Make(
416 "parent", Repetition::REQUIRED,
417 {GroupNode::Make(
418 "a", Repetition::OPTIONAL,
419 {PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)})}));
420
421 LevelVector def_levels = {1, 0, 1, 1};
422 LevelVector rep_levels = {};
423 std::vector<int32_t> values = {4, 5, 6};
424
425 auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false)),
426 R"([{"a": {"b": 4}},
427 {"a": null},
428 {"a": {"b": 5}},
429 {"a": {"b": 6}}
430 ])");
431 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
432}
433
434TEST_F(TestReconstructColumn, NestedOptionalRequiredOptional) {
435 // Arrow schema: struct(a: struct(b: int32) not null)
436 SetParquetSchema(GroupNode::Make(
437 "parent", Repetition::OPTIONAL,
438 {GroupNode::Make(
439 "a", Repetition::REQUIRED,
440 {PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})}));
441
442 LevelVector def_levels = {1, 2, 0, 2, 2};
443 LevelVector rep_levels = {};
444 std::vector<int32_t> values = {4, 5, 6};
445
446 auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32()), false),
447 R"([{"a": {"b": null}},
448 {"a": {"b": 4}},
449 null,
450 {"a": {"b": 5}},
451 {"a": {"b": 6}}
452 ])");
453 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
454}
455
456TEST_F(TestReconstructColumn, NestedOptionalOptionalOptional) {
457 // Arrow schema: struct(a: struct(b: int32) not null)
458 SetParquetSchema(GroupNode::Make(
459 "parent", Repetition::OPTIONAL,
460 {GroupNode::Make(
461 "a", Repetition::OPTIONAL,
462 {PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)})}));
463
464 LevelVector def_levels = {1, 2, 0, 3, 3, 3};
465 LevelVector rep_levels = {};
466 std::vector<int32_t> values = {4, 5, 6};
467
468 auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32())),
469 R"([{"a": null},
470 {"a": {"b": null}},
471 null,
472 {"a": {"b": 4}},
473 {"a": {"b": 5}},
474 {"a": {"b": 6}}
475 ])");
476 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
477}
478
479//
480// Struct encodings (two fields)
481//
482
483TEST_F(TestReconstructColumn, NestedTwoFields1) {
484 // Arrow schema: struct(a: int32 not null, b: int64 not null) not null
485 SetParquetSchema(GroupNode::Make(
486 "parent", Repetition::REQUIRED,
487 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32),
488 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)}));
489
490 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6}));
491 ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9}));
492
493 auto type = struct_(
494 {field("a", int32(), /*nullable=*/false), field("b", int64(), /*nullable=*/false)});
495 auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": 7},
496 {"a": 5, "b": 8},
497 {"a": 6, "b": 9}])");
498
499 CheckColumn(/*column_index=*/0, *expected);
500}
501
502TEST_F(TestReconstructColumn, NestedTwoFields2) {
503 // Arrow schema: struct(a: int32 not null, b: int64) not null
504 SetParquetSchema(GroupNode::Make(
505 "parent", Repetition::REQUIRED,
506 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32),
507 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)}));
508
509 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6}));
510 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8}));
511
512 auto type = struct_({field("a", int32(), /*nullable=*/false), field("b", int64())});
513 auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": null},
514 {"a": 5, "b": 7},
515 {"a": 6, "b": 8}])");
516
517 CheckColumn(/*column_index=*/0, *expected);
518}
519
520TEST_F(TestReconstructColumn, NestedTwoFields3) {
521 // Arrow schema: struct(a: int32 not null, b: int64 not null)
522 SetParquetSchema(GroupNode::Make(
523 "parent", Repetition::OPTIONAL,
524 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32),
525 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)}));
526
527 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5}));
528 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8}));
529
530 auto type = struct_(
531 {field("a", int32(), /*nullable=*/false), field("b", int64(), /*nullable=*/false)});
532 auto expected = ArrayFromJSON(type, R"([null,
533 {"a": 4, "b": 7},
534 {"a": 5, "b": 8}])");
535
536 CheckColumn(/*column_index=*/0, *expected);
537}
538
539TEST_F(TestReconstructColumn, NestedTwoFields4) {
540 // Arrow schema: struct(a: int32, b: int64 not null)
541 SetParquetSchema(GroupNode::Make(
542 "parent", Repetition::OPTIONAL,
543 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
544 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)}));
545
546 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, RepLevels{}, Int32Vector{4}));
547 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8}));
548
549 auto type = struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)});
550 auto expected = ArrayFromJSON(type, R"([null,
551 {"a": null, "b": 7},
552 {"a": 4, "b": 8}])");
553
554 CheckColumn(/*column_index=*/0, *expected);
555}
556
557TEST_F(TestReconstructColumn, NestedTwoFields5) {
558 // Arrow schema: struct(a: int32, b: int64)
559 SetParquetSchema(GroupNode::Make(
560 "parent", Repetition::OPTIONAL,
561 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
562 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)}));
563
564 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, RepLevels{}, Int32Vector{4}));
565 ASSERT_OK(WriteInt64Column(DefLevels{0, 2, 1}, RepLevels{}, Int64Vector{7}));
566
567 auto type = struct_({field("a", int32()), field("b", int64())});
568 auto expected = ArrayFromJSON(type, R"([null,
569 {"a": null, "b": 7},
570 {"a": 4, "b": null}])");
571
572 CheckColumn(/*column_index=*/0, *expected);
573}
574
575//
576// Nested struct encodings (two fields)
577//
578
579TEST_F(TestReconstructColumn, NestedNestedTwoFields1) {
580 // Arrow schema: struct(a: struct(aa: int32 not null,
581 // ab: int64 not null) not null,
582 // b: int32 not null) not null
583 SetParquetSchema(GroupNode::Make(
584 "parent", Repetition::REQUIRED,
585 {GroupNode::Make(
586 "a", Repetition::REQUIRED,
587 {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32),
588 PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64)}),
589 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)}));
590
591 // aa
592 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6}));
593 // ab
594 ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9}));
595 // b
596 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12}));
597
598 auto type = struct_({field("a",
599 struct_({field("aa", int32(), /*nullable=*/false),
600 field("ab", int64(), /*nullable=*/false)}),
601 /*nullable=*/false),
602 field("b", int32(), /*nullable=*/false)});
603 auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10},
604 {"a": {"aa": 5, "ab": 8}, "b": 11},
605 {"a": {"aa": 6, "ab": 9}, "b": 12}])");
606
607 CheckColumn(/*column_index=*/0, *expected);
608}
609
610TEST_F(TestReconstructColumn, NestedNestedTwoFields2) {
611 // Arrow schema: struct(a: struct(aa: int32,
612 // ab: int64 not null) not null,
613 // b: int32 not null) not null
614 SetParquetSchema(GroupNode::Make(
615 "parent", Repetition::REQUIRED,
616 {GroupNode::Make(
617 "a", Repetition::REQUIRED,
618 {PrimitiveNode::Make("aa", Repetition::OPTIONAL, ParquetType::INT32),
619 PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64)}),
620 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)}));
621
622 // aa
623 ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, RepLevels{}, Int32Vector{4, 5}));
624 // ab
625 ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{7, 8, 9}));
626 // b
627 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12}));
628
629 auto type = struct_(
630 {field("a",
631 struct_({field("aa", int32()), field("ab", int64(), /*nullable=*/false)}),
632 /*nullable=*/false),
633 field("b", int32(), /*nullable=*/false)});
634 auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10},
635 {"a": {"aa": null, "ab": 8}, "b": 11},
636 {"a": {"aa": 5, "ab": 9}, "b": 12}])");
637
638 CheckColumn(/*column_index=*/0, *expected);
639}
640
641TEST_F(TestReconstructColumn, NestedNestedTwoFields3) {
642 // Arrow schema: struct(a: struct(aa: int32 not null,
643 // ab: int64) not null,
644 // b: int32) not null
645 SetParquetSchema(GroupNode::Make(
646 "parent", Repetition::REQUIRED,
647 {GroupNode::Make(
648 "a", Repetition::REQUIRED,
649 {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32),
650 PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}),
651 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)}));
652
653 // aa
654 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{4, 5, 6}));
655 // ab
656 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, RepLevels{}, Int64Vector{7, 8}));
657 // b
658 ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, RepLevels{}, Int32Vector{10, 11}));
659
660 auto type = struct_(
661 {field("a",
662 struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())}),
663 /*nullable=*/false),
664 field("b", int32())});
665 auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": null}, "b": 10},
666 {"a": {"aa": 5, "ab": 7}, "b": null},
667 {"a": {"aa": 6, "ab": 8}, "b": 11}])");
668
669 CheckColumn(/*column_index=*/0, *expected);
670}
671
672TEST_F(TestReconstructColumn, NestedNestedTwoFields4) {
673 // Arrow schema: struct(a: struct(aa: int32 not null,
674 // ab: int64),
675 // b: int32 not null) not null
676 SetParquetSchema(GroupNode::Make(
677 "parent", Repetition::REQUIRED,
678 {GroupNode::Make(
679 "a", Repetition::OPTIONAL,
680 {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32),
681 PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}),
682 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32)}));
683
684 // aa
685 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5}));
686 // ab
687 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, RepLevels{}, Int64Vector{7}));
688 // b
689 ASSERT_OK(WriteInt32Column(DefLevels{}, RepLevels{}, Int32Vector{10, 11, 12}));
690
691 auto type = struct_({field("a", struct_({field("aa", int32(), /*nullable=*/false),
692 field("ab", int64())})),
693 field("b", int32(), /*nullable=*/false)});
694 auto expected = ArrayFromJSON(type, R"([{"a": null, "b": 10},
695 {"a": {"aa": 4, "ab": null}, "b": 11},
696 {"a": {"aa": 5, "ab": 7}, "b": 12}])");
697
698 CheckColumn(/*column_index=*/0, *expected);
699}
700
701TEST_F(TestReconstructColumn, NestedNestedTwoFields5) {
702 // Arrow schema: struct(a: struct(aa: int32 not null,
703 // ab: int64) not null,
704 // b: int32)
705 SetParquetSchema(GroupNode::Make(
706 "parent", Repetition::OPTIONAL,
707 {GroupNode::Make(
708 "a", Repetition::REQUIRED,
709 {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32),
710 PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}),
711 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)}));
712
713 // aa
714 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5}));
715 // ab
716 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, RepLevels{}, Int64Vector{7}));
717 // b
718 ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1}, RepLevels{}, Int32Vector{10}));
719
720 auto type = struct_(
721 {field("a",
722 struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())}),
723 /*nullable=*/false),
724 field("b", int32())});
725 auto expected = ArrayFromJSON(type, R"([null,
726 {"a": {"aa": 4, "ab": null}, "b": 10},
727 {"a": {"aa": 5, "ab": 7}, "b": null}])");
728
729 CheckColumn(/*column_index=*/0, *expected);
730}
731
732TEST_F(TestReconstructColumn, NestedNestedTwoFields6) {
733 // Arrow schema: struct(a: struct(aa: int32 not null,
734 // ab: int64),
735 // b: int32)
736 SetParquetSchema(GroupNode::Make(
737 "parent", Repetition::OPTIONAL,
738 {GroupNode::Make(
739 "a", Repetition::OPTIONAL,
740 {PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32),
741 PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64)}),
742 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32)}));
743
744 // aa
745 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2}, RepLevels{}, Int32Vector{4, 5}));
746 // ab
747 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 3}, RepLevels{}, Int64Vector{7}));
748 // b
749 ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{}, Int32Vector{10, 11}));
750
751 auto type = struct_({field("a", struct_({field("aa", int32(), /*nullable=*/false),
752 field("ab", int64())})),
753 field("b", int32())});
754 auto expected = ArrayFromJSON(type, R"([null,
755 {"a": null, "b": 10},
756 {"a": {"aa": 4, "ab": null}, "b": null},
757 {"a": {"aa": 5, "ab": 7}, "b": 11}])");
758
759 CheckColumn(/*column_index=*/0, *expected);
760}
761
762//
763// Three-level list encodings
764//
765
766TEST_F(TestReconstructColumn, ThreeLevelListRequiredRequired) {
767 // Arrow schema: list(int32 not null) not null
768 SetParquetSchema(GroupNode::Make(
769 "parent", Repetition::REQUIRED,
770 {GroupNode::Make(
771 "list", Repetition::REPEATED,
772 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
773 LogicalType::List()));
774
775 LevelVector def_levels = {0, 1, 1, 1};
776 LevelVector rep_levels = {0, 0, 1, 0};
777 std::vector<int32_t> values = {4, 5, 6};
778
779 // TODO should field name "element" (Parquet convention for List nodes)
780 // be changed to "item" (Arrow convention for List types)?
781 auto expected = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]");
782 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
783}
784
785TEST_F(TestReconstructColumn, ThreeLevelListOptionalRequired) {
786 // Arrow schema: list(int32 not null)
787 SetParquetSchema(GroupNode::Make(
788 "parent", Repetition::OPTIONAL,
789 {GroupNode::Make(
790 "list", Repetition::REPEATED,
791 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
792 LogicalType::List()));
793
794 LevelVector def_levels = {0, 1, 2, 2, 2};
795 LevelVector rep_levels = {0, 0, 0, 1, 0};
796 std::vector<int32_t> values = {4, 5, 6};
797
798 auto expected =
799 ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]");
800 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
801}
802
803TEST_F(TestReconstructColumn, ThreeLevelListRequiredOptional) {
804 // Arrow schema: list(int32) not null
805 SetParquetSchema(GroupNode::Make(
806 "parent", Repetition::REQUIRED,
807 {GroupNode::Make(
808 "list", Repetition::REPEATED,
809 {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})},
810 LogicalType::List()));
811
812 LevelVector def_levels = {0, 1, 2, 2, 2};
813 LevelVector rep_levels = {0, 0, 1, 0, 1};
814 std::vector<int32_t> values = {4, 5, 6};
815
816 auto expected = ArrayFromJSON(List(int32()), "[[], [null, 4], [5, 6]]");
817 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
818}
819
820TEST_F(TestReconstructColumn, ThreeLevelListOptionalOptional) {
821 // Arrow schema: list(int32)
822 SetParquetSchema(GroupNode::Make(
823 "parent", Repetition::OPTIONAL,
824 {GroupNode::Make(
825 "list", Repetition::REPEATED,
826 {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})},
827 LogicalType::List()));
828
829 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
830 LevelVector rep_levels = {0, 0, 0, 1, 0, 1};
831 std::vector<int32_t> values = {4, 5, 6};
832
833 auto expected = ArrayFromJSON(List(int32()), "[null, [], [null, 4], [5, 6]]");
834 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
835}
836
837//
838// Legacy list encodings
839//
840
841TEST_F(TestReconstructColumn, TwoLevelListRequired) {
842 // Arrow schema: list(int32 not null) not null
843 SetParquetSchema(GroupNode::Make(
844 "parent", Repetition::REQUIRED,
845 {PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32)},
846 LogicalType::List()));
847
848 LevelVector def_levels = {0, 1, 1, 1};
849 LevelVector rep_levels = {0, 0, 1, 0};
850 std::vector<int32_t> values = {4, 5, 6};
851
852 // TODO should field name "element" (Parquet convention for List nodes)
853 // be changed to "item" (Arrow convention for List types)?
854 auto expected = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]");
855 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
856}
857
858TEST_F(TestReconstructColumn, TwoLevelListOptional) {
859 // Arrow schema: list(int32 not null)
860 SetParquetSchema(GroupNode::Make(
861 "parent", Repetition::OPTIONAL,
862 {PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32)},
863 LogicalType::List()));
864
865 LevelVector def_levels = {0, 1, 2, 2, 2};
866 LevelVector rep_levels = {0, 0, 0, 1, 0};
867 std::vector<int32_t> values = {4, 5, 6};
868
869 auto expected =
870 ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]");
871 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
872}
873
874//
875// List-in-struct
876//
877
878TEST_F(TestReconstructColumn, NestedList1) {
879 // Arrow schema: struct(a: list(int32 not null) not null) not null
880 SetParquetSchema(GroupNode::Make(
881 "a", Repetition::REQUIRED,
882 {GroupNode::Make(
883 "p", Repetition::REQUIRED,
884 {GroupNode::Make("list", Repetition::REPEATED,
885 {PrimitiveNode::Make("element", Repetition::REQUIRED,
886 ParquetType::INT32)})},
887 LogicalType::List())}));
888
889 LevelVector def_levels = {0, 1, 1, 1};
890 LevelVector rep_levels = {0, 0, 1, 0};
891 std::vector<int32_t> values = {4, 5, 6};
892
893 auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false),
894 /*nullable=*/false);
895 auto expected = ArrayFromJSON(type, R"([{"p": []},
896 {"p": [4, 5]},
897 {"p": [6]}])");
898 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
899}
900
901TEST_F(TestReconstructColumn, NestedList2) {
902 // Arrow schema: struct(a: list(int32 not null) not null)
903 SetParquetSchema(GroupNode::Make(
904 "a", Repetition::OPTIONAL,
905 {GroupNode::Make(
906 "p", Repetition::REQUIRED,
907 {GroupNode::Make("list", Repetition::REPEATED,
908 {PrimitiveNode::Make("element", Repetition::REQUIRED,
909 ParquetType::INT32)})},
910 LogicalType::List())}));
911
912 LevelVector def_levels = {0, 1, 2, 2, 2};
913 LevelVector rep_levels = {0, 0, 0, 1, 0};
914 std::vector<int32_t> values = {4, 5, 6};
915
916 auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false),
917 /*nullable=*/false);
918 auto expected = ArrayFromJSON(type, R"([null,
919 {"p": []},
920 {"p": [4, 5]},
921 {"p": [6]}])");
922 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
923}
924
925TEST_F(TestReconstructColumn, NestedList3) {
926 // Arrow schema: struct(a: list(int32 not null)) not null
927 SetParquetSchema(GroupNode::Make(
928 "a", Repetition::REQUIRED, // column name (column a is a struct of)
929 {GroupNode::Make(
930 "p", Repetition::OPTIONAL, // name in struct
931 {GroupNode::Make("list", Repetition::REPEATED,
932 {PrimitiveNode::Make("element", Repetition::REQUIRED,
933 ParquetType::INT32)})},
934 LogicalType::List())}));
935
936 LevelVector def_levels = {0, 1, 2, 2, 2};
937 LevelVector rep_levels = {0, 0, 0, 1, 0};
938 std::vector<int32_t> values = {4, 5, 6};
939
940 auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false));
941 auto expected = ArrayFromJSON(type, R"([{"p": null},
942 {"p": []},
943 {"p": [4, 5]},
944 {"p": [6]}])");
945 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
946}
947
948TEST_F(TestReconstructColumn, NestedList4) {
949 // Arrow schema: struct(a: list(int32 not null))
950 SetParquetSchema(GroupNode::Make(
951 "a", Repetition::OPTIONAL,
952 {GroupNode::Make(
953 "p", Repetition::OPTIONAL,
954 {GroupNode::Make("list", Repetition::REPEATED,
955 {PrimitiveNode::Make("element", Repetition::REQUIRED,
956 ParquetType::INT32)})},
957 LogicalType::List())}));
958
959 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
960 LevelVector rep_levels = {0, 0, 0, 0, 1, 0};
961 std::vector<int32_t> values = {4, 5, 6};
962
963 auto type = OneFieldStruct("p", List(int32(), /*nullable=*/false));
964 auto expected = ArrayFromJSON(type, R"([null,
965 {"p": null},
966 {"p": []},
967 {"p": [4, 5]},
968 {"p": [6]}])");
969 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
970}
971
972TEST_F(TestReconstructColumn, NestedList5) {
973 // Arrow schema: struct(a: list(int32) not null)
974 SetParquetSchema(GroupNode::Make(
975 "a", Repetition::OPTIONAL,
976 {GroupNode::Make(
977 "p", Repetition::REQUIRED,
978 {GroupNode::Make("list", Repetition::REPEATED,
979 {PrimitiveNode::Make("element", Repetition::OPTIONAL,
980 ParquetType::INT32)})},
981 LogicalType::List())}));
982
983 LevelVector def_levels = {0, 1, 3, 2, 3, 3};
984 LevelVector rep_levels = {0, 0, 0, 1, 0, 1};
985 std::vector<int32_t> values = {4, 5, 6};
986
987 auto type = OneFieldStruct("p", List(int32()), /*nullable=*/false);
988 auto expected = ArrayFromJSON(type, R"([null,
989 {"p": []},
990 {"p": [4, null]},
991 {"p": [5, 6]}])");
992 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
993}
994
995TEST_F(TestReconstructColumn, NestedList6) {
996 // Arrow schema: struct(a: list(int32))
997 SetParquetSchema(GroupNode::Make(
998 "a", Repetition::OPTIONAL,
999 {GroupNode::Make(
1000 "p", Repetition::OPTIONAL,
1001 {GroupNode::Make("list", Repetition::REPEATED,
1002 {PrimitiveNode::Make("element", Repetition::OPTIONAL,
1003 ParquetType::INT32)})},
1004 LogicalType::List())}));
1005
1006 LevelVector def_levels = {0, 1, 2, 4, 3, 4, 4};
1007 LevelVector rep_levels = {0, 0, 0, 0, 1, 0, 1};
1008 std::vector<int32_t> values = {4, 5, 6};
1009
1010 auto type = OneFieldStruct("p", List(int32()));
1011 auto expected = ArrayFromJSON(type, R"([null,
1012 {"p": null},
1013 {"p": []},
1014 {"p": [4, null]},
1015 {"p": [5, 6]}])");
1016 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1017}
1018
1019//
1020// Struct-in-list
1021//
1022
1023TEST_F(TestReconstructColumn, ListNested1) {
1024 // Arrow schema: list(struct(a: int32 not null) not null) not null
1025 SetParquetSchema(GroupNode::Make(
1026 "parent", Repetition::REQUIRED,
1027 {GroupNode::Make("list", Repetition::REPEATED,
1028 {GroupNode::Make("element", Repetition::REQUIRED,
1029 {PrimitiveNode::Make("a", Repetition::REQUIRED,
1030 ParquetType::INT32)})})},
1031 LogicalType::List()));
1032
1033 LevelVector def_levels = {0, 1, 1, 1};
1034 LevelVector rep_levels = {0, 0, 1, 0};
1035 std::vector<int32_t> values = {4, 5, 6};
1036
1037 auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false),
1038 /*nullable=*/false);
1039 auto expected = ArrayFromJSON(type,
1040 R"([[],
1041 [{"a": 4}, {"a": 5}],
1042 [{"a": 6}]])");
1043 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1044}
1045
1046TEST_F(TestReconstructColumn, ListNested2) {
1047 // Arrow schema: list(struct(a: int32 not null) not null)
1048 SetParquetSchema(GroupNode::Make(
1049 "parent", Repetition::OPTIONAL,
1050 {GroupNode::Make("list", Repetition::REPEATED,
1051 {GroupNode::Make("element", Repetition::REQUIRED,
1052 {PrimitiveNode::Make("a", Repetition::REQUIRED,
1053 ParquetType::INT32)})})},
1054 LogicalType::List()));
1055
1056 LevelVector def_levels = {0, 1, 2, 2, 2};
1057 LevelVector rep_levels = {0, 0, 0, 1, 0};
1058 std::vector<int32_t> values = {4, 5, 6};
1059
1060 auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false),
1061 /*nullable=*/false);
1062 auto expected = ArrayFromJSON(type,
1063 R"([null,
1064 [],
1065 [{"a": 4}, {"a": 5}],
1066 [{"a": 6}]])");
1067 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1068}
1069
1070TEST_F(TestReconstructColumn, ListNested3) {
1071 // Arrow schema: list(struct(a: int32 not null)) not null
1072 SetParquetSchema(GroupNode::Make(
1073 "parent", Repetition::REQUIRED,
1074 {GroupNode::Make("list", Repetition::REPEATED,
1075 {GroupNode::Make("element", Repetition::OPTIONAL,
1076 {PrimitiveNode::Make("a", Repetition::REQUIRED,
1077 ParquetType::INT32)})})},
1078 LogicalType::List()));
1079
1080 LevelVector def_levels = {0, 1, 2, 2, 2};
1081 LevelVector rep_levels = {0, 0, 1, 1, 0};
1082 std::vector<int32_t> values = {4, 5, 6};
1083
1084 auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false));
1085 auto expected = ArrayFromJSON(type,
1086 R"([[],
1087 [null, {"a": 4}, {"a": 5}],
1088 [{"a": 6}]])");
1089 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1090}
1091
1092TEST_F(TestReconstructColumn, ListNested4) {
1093 // Arrow schema: list(struct(a: int32 not null))
1094 SetParquetSchema(GroupNode::Make(
1095 "parent", Repetition::OPTIONAL,
1096 {GroupNode::Make("list", Repetition::REPEATED,
1097 {GroupNode::Make("element", Repetition::OPTIONAL,
1098 {PrimitiveNode::Make("a", Repetition::REQUIRED,
1099 ParquetType::INT32)})})},
1100 LogicalType::List()));
1101
1102 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
1103 LevelVector rep_levels = {0, 0, 0, 1, 1, 0};
1104 std::vector<int32_t> values = {4, 5, 6};
1105
1106 auto type = List(OneFieldStruct("a", int32(), /*nullable=*/false));
1107 auto expected = ArrayFromJSON(type,
1108 R"([null,
1109 [],
1110 [null, {"a": 4}, {"a": 5}],
1111 [{"a": 6}]])");
1112 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1113}
1114
1115TEST_F(TestReconstructColumn, ListNested5) {
1116 // Arrow schema: list(struct(a: int32) not null)
1117 SetParquetSchema(GroupNode::Make(
1118 "parent", Repetition::OPTIONAL,
1119 {GroupNode::Make("list", Repetition::REPEATED,
1120 {GroupNode::Make("element", Repetition::REQUIRED,
1121 {PrimitiveNode::Make("a", Repetition::OPTIONAL,
1122 ParquetType::INT32)})})},
1123 LogicalType::List()));
1124
1125 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
1126 LevelVector rep_levels = {0, 0, 0, 1, 0, 1};
1127 std::vector<int32_t> values = {4, 5, 6};
1128
1129 auto type = List(OneFieldStruct("a", int32()),
1130 /*nullable=*/false);
1131 auto expected = ArrayFromJSON(type,
1132 R"([null,
1133 [],
1134 [{"a": null}, {"a": 4}],
1135 [{"a": 5}, {"a": 6}]])");
1136 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1137}
1138
1139TEST_F(TestReconstructColumn, ListNested6) {
1140 // Arrow schema: list(struct(a: int32))
1141 SetParquetSchema(GroupNode::Make(
1142 "parent", Repetition::OPTIONAL,
1143 {GroupNode::Make("list", Repetition::REPEATED,
1144 {GroupNode::Make("element", Repetition::OPTIONAL,
1145 {PrimitiveNode::Make("a", Repetition::OPTIONAL,
1146 ParquetType::INT32)})})},
1147 LogicalType::List()));
1148
1149 LevelVector def_levels = {0, 1, 2, 3, 4, 4, 4};
1150 LevelVector rep_levels = {0, 0, 0, 1, 1, 0, 1};
1151 std::vector<int32_t> values = {4, 5, 6};
1152
1153 auto type = List(OneFieldStruct("a", int32()));
1154 auto expected = ArrayFromJSON(type,
1155 R"([null,
1156 [],
1157 [null, {"a": null}, {"a": 4}],
1158 [{"a": 5}, {"a": 6}]])");
1159 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1160}
1161
1162//
1163// Struct (two fields)-in-list
1164//
1165
1166TEST_F(TestReconstructColumn, ListNestedTwoFields1) {
1167 // Arrow schema: list(struct(a: int32 not null,
1168 // b: int64 not null) not null) not null
1169 SetParquetSchema(GroupNode::Make(
1170 "parent", Repetition::REQUIRED,
1171 {GroupNode::Make(
1172 "list", Repetition::REPEATED,
1173 {GroupNode::Make(
1174 "element", Repetition::REQUIRED,
1175 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32),
1176 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})},
1177 LogicalType::List()));
1178
1179 // a
1180 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
1181 Int32Vector{4, 5, 6}));
1182 // b
1183 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
1184 Int64Vector{7, 8, 9}));
1185
1186 auto type = List(struct_({field("a", int32(), /*nullable=*/false),
1187 field("b", int64(), /*nullable=*/false)}),
1188 /*nullable=*/false);
1189 auto expected = ArrayFromJSON(type,
1190 R"([[],
1191 [{"a": 4, "b": 7}, {"a": 5, "b": 8}],
1192 [{"a": 6, "b": 9}]])");
1193 CheckColumn(/*column_index=*/0, *expected);
1194}
1195
1196TEST_F(TestReconstructColumn, ListNestedTwoFields2) {
1197 // Arrow schema: list(struct(a: int32,
1198 // b: int64 not null) not null) not null
1199 SetParquetSchema(GroupNode::Make(
1200 "parent", Repetition::REQUIRED,
1201 {GroupNode::Make(
1202 "list", Repetition::REPEATED,
1203 {GroupNode::Make(
1204 "element", Repetition::REQUIRED,
1205 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
1206 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})},
1207 LogicalType::List()));
1208
1209 // a
1210 ASSERT_OK(
1211 WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{0, 0, 1, 0}, Int32Vector{4, 5}));
1212 // b
1213 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
1214 Int64Vector{7, 8, 9}));
1215
1216 auto type =
1217 List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}),
1218 /*nullable=*/false);
1219 auto expected = ArrayFromJSON(type,
1220 R"([[],
1221 [{"a": 4, "b": 7}, {"a": null, "b": 8}],
1222 [{"a": 5, "b": 9}]])");
1223 CheckColumn(/*column_index=*/0, *expected);
1224}
1225
1226TEST_F(TestReconstructColumn, ListNestedTwoFields3) {
1227 // Arrow schema: list(struct(a: int32 not null,
1228 // b: int64 not null)) not null
1229 SetParquetSchema(GroupNode::Make(
1230 "parent", Repetition::REQUIRED,
1231 {GroupNode::Make(
1232 "list", Repetition::REPEATED,
1233 {GroupNode::Make(
1234 "element", Repetition::OPTIONAL,
1235 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32),
1236 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})},
1237 LogicalType::List()));
1238
1239 // a
1240 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0},
1241 Int32Vector{4, 5, 6}));
1242 // b
1243 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0},
1244 Int64Vector{7, 8, 9}));
1245
1246 auto type = List(struct_({field("a", int32(), /*nullable=*/false),
1247 field("b", int64(), /*nullable=*/false)}));
1248 auto expected = ArrayFromJSON(type,
1249 R"([[],
1250 [null, {"a": 4, "b": 7}, {"a": 5, "b": 8}],
1251 [{"a": 6, "b": 9}]])");
1252 CheckColumn(/*column_index=*/0, *expected);
1253}
1254
1255TEST_F(TestReconstructColumn, ListNestedTwoFields4) {
1256 // Arrow schema: list(struct(a: int32,
1257 // b: int64 not null) not null)
1258 SetParquetSchema(GroupNode::Make(
1259 "parent", Repetition::OPTIONAL,
1260 {GroupNode::Make(
1261 "list", Repetition::REPEATED,
1262 {GroupNode::Make(
1263 "element", Repetition::REQUIRED,
1264 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
1265 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})},
1266 LogicalType::List()));
1267
1268 // a
1269 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0},
1270 Int32Vector{4, 5}));
1271 // b
1272 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
1273 Int64Vector{7, 8, 9}));
1274
1275 auto type =
1276 List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}),
1277 /*nullable=*/false);
1278 auto expected = ArrayFromJSON(type,
1279 R"([null,
1280 [],
1281 [{"a": 4, "b": 7}, {"a": null, "b": 8}],
1282 [{"a": 5, "b": 9}]])");
1283 CheckColumn(/*column_index=*/0, *expected);
1284}
1285
1286TEST_F(TestReconstructColumn, ListNestedTwoFields5) {
1287 // Arrow schema: list(struct(a: int32,
1288 // b: int64 not null))
1289 SetParquetSchema(GroupNode::Make(
1290 "parent", Repetition::OPTIONAL,
1291 {GroupNode::Make(
1292 "list", Repetition::REPEATED,
1293 {GroupNode::Make(
1294 "element", Repetition::OPTIONAL,
1295 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
1296 PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64)})})},
1297 LogicalType::List()));
1298
1299 // a
1300 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0},
1301 Int32Vector{4}));
1302 // b
1303 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0},
1304 Int64Vector{7, 8}));
1305
1306 auto type =
1307 List(struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)}));
1308 auto expected = ArrayFromJSON(type,
1309 R"([null,
1310 [],
1311 [{"a": 4, "b": 7}, null],
1312 [{"a": null, "b": 8}]])");
1313 CheckColumn(/*column_index=*/0, *expected);
1314}
1315
1316TEST_F(TestReconstructColumn, ListNestedTwoFields6) {
1317 // Arrow schema: list(struct(a: int32,
1318 // b: int64))
1319 SetParquetSchema(GroupNode::Make(
1320 "parent", Repetition::OPTIONAL,
1321 {GroupNode::Make(
1322 "list", Repetition::REPEATED,
1323 {GroupNode::Make(
1324 "element", Repetition::OPTIONAL,
1325 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32),
1326 PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64)})})},
1327 LogicalType::List()));
1328
1329 // a
1330 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0},
1331 Int32Vector{4}));
1332 // b
1333 ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 4}, RepLevels{0, 0, 0, 1, 0},
1334 Int64Vector{7}));
1335
1336 auto type = List(struct_({field("a", int32()), field("b", int64())}));
1337 auto expected = ArrayFromJSON(type,
1338 R"([null,
1339 [],
1340 [{"a": 4, "b": null}, null],
1341 [{"a": null, "b": 7}]])");
1342 CheckColumn(/*column_index=*/0, *expected);
1343}
1344
1345//
1346// List-in-struct (two fields)
1347//
1348
1349TEST_F(TestReconstructColumn, NestedTwoFieldsList1) {
1350 // Arrow schema: struct(a: int64 not null,
1351 // b: list(int32 not null) not null
1352 // ) not null
1353 SetParquetSchema(GroupNode::Make(
1354 "parent", Repetition::REQUIRED,
1355 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64),
1356 GroupNode::Make(
1357 "b", Repetition::REQUIRED,
1358 {GroupNode::Make("list", Repetition::REPEATED,
1359 {PrimitiveNode::Make("element", Repetition::REQUIRED,
1360 ParquetType::INT32)})},
1361 LogicalType::List())}));
1362
1363 // a
1364 ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{4, 5, 6}));
1365 // b
1366 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
1367 Int32Vector{7, 8, 9}));
1368
1369 auto type =
1370 struct_({field("a", int64(), /*nullable=*/false),
1371 field("b", List(int32(), /*nullable=*/false), /*nullable=*/false)});
1372 auto expected = ArrayFromJSON(type,
1373 R"([{"a": 4, "b": []},
1374 {"a": 5, "b": [7, 8]},
1375 {"a": 6, "b": [9]}])");
1376 CheckColumn(/*column_index=*/0, *expected);
1377}
1378
1379TEST_F(TestReconstructColumn, NestedTwoFieldsList2) {
1380 // Arrow schema: struct(a: int64 not null,
1381 // b: list(int32 not null)
1382 // ) not null
1383 SetParquetSchema(GroupNode::Make(
1384 "parent", Repetition::REQUIRED,
1385 {PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64),
1386 GroupNode::Make(
1387 "b", Repetition::OPTIONAL,
1388 {GroupNode::Make("list", Repetition::REPEATED,
1389 {PrimitiveNode::Make("element", Repetition::REQUIRED,
1390 ParquetType::INT32)})},
1391 LogicalType::List())}));
1392
1393 // a
1394 ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{3, 4, 5, 6}));
1395 // b
1396 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
1397 Int32Vector{7, 8, 9}));
1398
1399 auto type = struct_({field("a", int64(), /*nullable=*/false),
1400 field("b", List(int32(), /*nullable=*/false))});
1401 auto expected = ArrayFromJSON(type,
1402 R"([{"a": 3, "b": null},
1403 {"a": 4, "b": []},
1404 {"a": 5, "b": [7, 8]},
1405 {"a": 6, "b": [9]}])");
1406 CheckColumn(/*column_index=*/0, *expected);
1407}
1408
1409TEST_F(TestReconstructColumn, NestedTwoFieldsList3) {
1410 // Arrow schema: struct(a: int64,
1411 // b: list(int32 not null)
1412 // ) not null
1413 SetParquetSchema(GroupNode::Make(
1414 "parent", Repetition::REQUIRED,
1415 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64),
1416 GroupNode::Make(
1417 "b", Repetition::OPTIONAL,
1418 {GroupNode::Make("list", Repetition::REPEATED,
1419 {PrimitiveNode::Make("element", Repetition::REQUIRED,
1420 ParquetType::INT32)})},
1421 LogicalType::List())}));
1422
1423 // a
1424 ASSERT_OK(WriteInt64Column(DefLevels{1, 1, 0, 1}, RepLevels{}, Int64Vector{4, 5, 6}));
1425 // b
1426 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
1427 Int32Vector{7, 8, 9}));
1428
1429 auto type =
1430 struct_({field("a", int64()), field("b", List(int32(), /*nullable=*/false))});
1431 auto expected = ArrayFromJSON(type,
1432 R"([{"a": 4, "b": null},
1433 {"a": 5, "b": []},
1434 {"a": null, "b": [7, 8]},
1435 {"a": 6, "b": [9]}])");
1436 CheckColumn(/*column_index=*/0, *expected);
1437}
1438
1439TEST_F(TestReconstructColumn, NestedTwoFieldsList4) {
1440 // Arrow schema: struct(a: int64,
1441 // b: list(int32 not null)
1442 // )
1443 SetParquetSchema(GroupNode::Make(
1444 "parent", Repetition::OPTIONAL,
1445 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64),
1446 GroupNode::Make(
1447 "b", Repetition::OPTIONAL,
1448 {GroupNode::Make("list", Repetition::REPEATED,
1449 {PrimitiveNode::Make("element", Repetition::REQUIRED,
1450 ParquetType::INT32)})},
1451 LogicalType::List())}));
1452
1453 // a
1454 ASSERT_OK(
1455 WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6}));
1456 // b
1457 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 3, 3, 3}, RepLevels{0, 0, 0, 0, 1, 0},
1458 Int32Vector{7, 8, 9}));
1459
1460 auto type =
1461 struct_({field("a", int64()), field("b", List(int32(), /*nullable=*/false))});
1462 auto expected = ArrayFromJSON(type,
1463 R"([null,
1464 {"a": 4, "b": null},
1465 {"a": 5, "b": []},
1466 {"a": null, "b": [7, 8]},
1467 {"a": 6, "b": [9]}])");
1468 CheckColumn(/*column_index=*/0, *expected);
1469}
1470
1471TEST_F(TestReconstructColumn, NestedTwoFieldsList5) {
1472 // Arrow schema: struct(a: int64, b: list(int32))
1473 SetParquetSchema(GroupNode::Make(
1474 "parent", Repetition::OPTIONAL,
1475 {PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64),
1476 GroupNode::Make(
1477 "b", Repetition::OPTIONAL,
1478 {GroupNode::Make("list", Repetition::REPEATED,
1479 {PrimitiveNode::Make("element", Repetition::OPTIONAL,
1480 ParquetType::INT32)})},
1481 LogicalType::List())}));
1482
1483 // a
1484 ASSERT_OK(
1485 WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6}));
1486 // b
1487 ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 4, 3, 4}, RepLevels{0, 0, 0, 0, 1, 0},
1488 Int32Vector{7, 8}));
1489
1490 auto type = struct_({field("a", int64()), field("b", List(int32()))});
1491 auto expected = ArrayFromJSON(type,
1492 R"([null,
1493 {"a": 4, "b": null},
1494 {"a": 5, "b": []},
1495 {"a": null, "b": [7, null]},
1496 {"a": 6, "b": [8]}])");
1497 CheckColumn(/*column_index=*/0, *expected);
1498}
1499
1500//
1501// List-in-list
1502//
1503
1504TEST_F(TestReconstructColumn, ListList1) {
1505 // Arrow schema: list(list(int32 not null) not null) not null
1506 auto inner_list = GroupNode::Make(
1507 "element", Repetition::REQUIRED,
1508 {GroupNode::Make(
1509 "list", Repetition::REPEATED,
1510 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
1511 LogicalType::List());
1512 SetParquetSchema(
1513 GroupNode::Make("parent", Repetition::REQUIRED,
1514 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1515 LogicalType::List()));
1516
1517 LevelVector def_levels = {0, 1, 2, 2, 2};
1518 LevelVector rep_levels = {0, 0, 1, 0, 2};
1519 std::vector<int32_t> values = {4, 5, 6};
1520
1521 auto type = List(List(int32(), /*nullable=*/false), /*nullable=*/false);
1522 auto expected = ArrayFromJSON(type, "[[], [[], [4]], [[5, 6]]]");
1523 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1524}
1525
1526TEST_F(TestReconstructColumn, ListList2) {
1527 // Arrow schema: list(list(int32 not null) not null)
1528 auto inner_list = GroupNode::Make(
1529 "element", Repetition::REQUIRED,
1530 {GroupNode::Make(
1531 "list", Repetition::REPEATED,
1532 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
1533 LogicalType::List());
1534 SetParquetSchema(
1535 GroupNode::Make("parent", Repetition::OPTIONAL,
1536 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1537 LogicalType::List()));
1538
1539 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
1540 LevelVector rep_levels = {0, 0, 0, 1, 0, 2};
1541 std::vector<int32_t> values = {4, 5, 6};
1542
1543 auto type = List(List(int32(), /*nullable=*/false), /*nullable=*/false);
1544 auto expected = ArrayFromJSON(type, "[null, [], [[], [4]], [[5, 6]]]");
1545 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1546}
1547
1548TEST_F(TestReconstructColumn, ListList3) {
1549 // Arrow schema: list(list(int32 not null)) not null
1550 auto inner_list = GroupNode::Make(
1551 "element", Repetition::OPTIONAL,
1552 {GroupNode::Make(
1553 "list", Repetition::REPEATED,
1554 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
1555 LogicalType::List());
1556 SetParquetSchema(
1557 GroupNode::Make("parent", Repetition::REQUIRED,
1558 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1559 LogicalType::List()));
1560
1561 LevelVector def_levels = {0, 1, 2, 3, 3, 3};
1562 LevelVector rep_levels = {0, 0, 1, 0, 1, 2};
1563 std::vector<int32_t> values = {4, 5, 6};
1564
1565 auto type = List(List(int32(), /*nullable=*/false));
1566 auto expected = ArrayFromJSON(type, "[[], [null, []], [[4], [5, 6]]]");
1567 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1568}
1569
1570TEST_F(TestReconstructColumn, ListList4) {
1571 // Arrow schema: list(list(int32 not null))
1572 auto inner_list = GroupNode::Make(
1573 "element", Repetition::OPTIONAL,
1574 {GroupNode::Make(
1575 "list", Repetition::REPEATED,
1576 {PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32)})},
1577 LogicalType::List());
1578 SetParquetSchema(
1579 GroupNode::Make("parent", Repetition::OPTIONAL,
1580 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1581 LogicalType::List()));
1582
1583 LevelVector def_levels = {0, 1, 2, 3, 4, 4, 4};
1584 LevelVector rep_levels = {0, 0, 0, 1, 1, 0, 2};
1585 std::vector<int32_t> values = {4, 5, 6};
1586
1587 auto type = List(List(int32(), /*nullable=*/false));
1588 auto expected = ArrayFromJSON(type, "[null, [], [null, [], [4]], [[5, 6]]]");
1589 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1590}
1591
1592TEST_F(TestReconstructColumn, ListList5) {
1593 // Arrow schema: list(list(int32) not null)
1594 auto inner_list = GroupNode::Make(
1595 "element", Repetition::REQUIRED,
1596 {GroupNode::Make(
1597 "list", Repetition::REPEATED,
1598 {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})},
1599 LogicalType::List());
1600 SetParquetSchema(
1601 GroupNode::Make("parent", Repetition::OPTIONAL,
1602 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1603 LogicalType::List()));
1604
1605 LevelVector def_levels = {0, 1, 2, 4, 4, 3, 4};
1606 LevelVector rep_levels = {0, 0, 0, 1, 0, 1, 2};
1607 std::vector<int32_t> values = {4, 5, 6};
1608
1609 auto type = List(List(int32()), /*nullable=*/false);
1610 auto expected = ArrayFromJSON(type, "[null, [], [[], [4]], [[5], [null, 6]]]");
1611 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1612}
1613
1614TEST_F(TestReconstructColumn, ListList6) {
1615 // Arrow schema: list(list(int32))
1616 auto inner_list = GroupNode::Make(
1617 "element", Repetition::OPTIONAL,
1618 {GroupNode::Make(
1619 "list", Repetition::REPEATED,
1620 {PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32)})},
1621 LogicalType::List());
1622 SetParquetSchema(
1623 GroupNode::Make("parent", Repetition::OPTIONAL,
1624 {GroupNode::Make("list", Repetition::REPEATED, {inner_list})},
1625 LogicalType::List()));
1626
1627 LevelVector def_levels = {0, 1, 2, 3, 4, 5, 5, 5};
1628 LevelVector rep_levels = {0, 0, 0, 1, 1, 2, 0, 2};
1629 std::vector<int32_t> values = {4, 5, 6};
1630
1631 auto type = List(List(int32()));
1632 auto expected = ArrayFromJSON(type, "[null, [], [null, [], [null, 4]], [[5, 6]]]");
1633 AssertReconstruct<ParquetType::INT32>(*expected, def_levels, rep_levels, values);
1634}
1635
1636// TODO legacy-list-in-struct etc.?
1637
1638} // namespace arrow
1639} // namespace parquet