]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/array/array_nested.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / array / array_nested.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/array/array_nested.h"
19
20#include <cstddef>
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <utility>
25#include <vector>
26
27#include "arrow/array/array_base.h"
28#include "arrow/array/array_primitive.h"
29#include "arrow/array/concatenate.h"
30#include "arrow/array/util.h"
31#include "arrow/buffer.h"
32#include "arrow/status.h"
33#include "arrow/type.h"
34#include "arrow/type_fwd.h"
35#include "arrow/type_traits.h"
36#include "arrow/util/atomic_shared_ptr.h"
37#include "arrow/util/bit_util.h"
38#include "arrow/util/bitmap_ops.h"
39#include "arrow/util/checked_cast.h"
40#include "arrow/util/logging.h"
41
42namespace arrow {
43
44using internal::BitmapAnd;
45using internal::checked_cast;
46using internal::checked_pointer_cast;
47using internal::CopyBitmap;
48
49// ----------------------------------------------------------------------
50// ListArray / LargeListArray
51
52namespace {
53
54template <typename TYPE>
55Status CleanListOffsets(const Array& offsets, MemoryPool* pool,
56 std::shared_ptr<Buffer>* offset_buf_out,
57 std::shared_ptr<Buffer>* validity_buf_out) {
58 using offset_type = typename TYPE::offset_type;
59 using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
60 using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
61
62 const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
63 const int64_t num_offsets = offsets.length();
64
65 if (offsets.null_count() > 0) {
66 if (!offsets.IsValid(num_offsets - 1)) {
67 return Status::Invalid("Last list offset should be non-null");
68 }
69
70 ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
71 AllocateBuffer(num_offsets * sizeof(offset_type), pool));
72
73 // Copy valid bits, ignoring the final offset (since for a length N list array,
74 // we have N + 1 offsets)
75 ARROW_ASSIGN_OR_RAISE(
76 auto clean_valid_bits,
77 offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1)));
78 *validity_buf_out = clean_valid_bits;
79
80 const offset_type* raw_offsets = typed_offsets.raw_values();
81 auto clean_raw_offsets =
82 reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
83
84 // Must work backwards so we can tell how many values were in the last non-null value
85 offset_type current_offset = raw_offsets[num_offsets - 1];
86 for (int64_t i = num_offsets - 1; i >= 0; --i) {
87 if (offsets.IsValid(i)) {
88 current_offset = raw_offsets[i];
89 }
90 clean_raw_offsets[i] = current_offset;
91 }
92
93 *offset_buf_out = std::move(clean_offsets);
94 } else {
95 *validity_buf_out = offsets.null_bitmap();
96 *offset_buf_out = typed_offsets.values();
97 }
98
99 return Status::OK();
100}
101
102template <typename TYPE>
103Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArrays(
104 const Array& offsets, const Array& values, MemoryPool* pool) {
105 using offset_type = typename TYPE::offset_type;
106 using ArrayType = typename TypeTraits<TYPE>::ArrayType;
107 using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
108
109 if (offsets.length() == 0) {
110 return Status::Invalid("List offsets must have non-zero length");
111 }
112
113 if (offsets.type_id() != OffsetArrowType::type_id) {
114 return Status::TypeError("List offsets must be ", OffsetArrowType::type_name());
115 }
116
117 std::shared_ptr<Buffer> offset_buf, validity_buf;
118 RETURN_NOT_OK(CleanListOffsets<TYPE>(offsets, pool, &offset_buf, &validity_buf));
119 BufferVector buffers = {validity_buf, offset_buf};
120
121 auto list_type = std::make_shared<TYPE>(values.type());
122 auto internal_data =
123 ArrayData::Make(list_type, offsets.length() - 1, std::move(buffers),
124 offsets.null_count(), offsets.offset());
125 internal_data->child_data.push_back(values.data());
126
127 return std::make_shared<ArrayType>(internal_data);
128}
129
130static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array, int64_t begin,
131 int64_t end) {
132 return array.Slice(begin, end - begin);
133}
134
135template <typename ListArrayT>
136Result<std::shared_ptr<Array>> FlattenListArray(const ListArrayT& list_array,
137 MemoryPool* memory_pool) {
138 const int64_t list_array_length = list_array.length();
139 std::shared_ptr<arrow::Array> value_array = list_array.values();
140
141 // Shortcut: if a ListArray does not contain nulls, then simply slice its
142 // value array with the first and the last offsets.
143 if (list_array.null_count() == 0) {
144 return SliceArrayWithOffsets(*value_array, list_array.value_offset(0),
145 list_array.value_offset(list_array_length));
146 }
147
148 // The ListArray contains nulls: there may be a non-empty sub-list behind
149 // a null and it must not be contained in the result.
150 std::vector<std::shared_ptr<Array>> non_null_fragments;
151 int64_t valid_begin = 0;
152 while (valid_begin < list_array_length) {
153 int64_t valid_end = valid_begin;
154 while (valid_end < list_array_length &&
155 (list_array.IsValid(valid_end) || list_array.value_length(valid_end) == 0)) {
156 ++valid_end;
157 }
158 if (valid_begin < valid_end) {
159 non_null_fragments.push_back(
160 SliceArrayWithOffsets(*value_array, list_array.value_offset(valid_begin),
161 list_array.value_offset(valid_end)));
162 }
163 valid_begin = valid_end + 1; // skip null entry
164 }
165
166 // Final attempt to avoid invoking Concatenate().
167 if (non_null_fragments.size() == 1) {
168 return non_null_fragments[0];
169 }
170
171 return Concatenate(non_null_fragments, memory_pool);
172}
173
174} // namespace
175
176namespace internal {
177
178template <typename TYPE>
179inline void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
180 Type::type expected_type_id) {
181 ARROW_CHECK_EQ(data->buffers.size(), 2);
182 ARROW_CHECK_EQ(data->type->id(), expected_type_id);
183 ARROW_CHECK_EQ(data->child_data.size(), 1);
184
185 self->Array::SetData(data);
186
187 self->list_type_ = checked_cast<const TYPE*>(data->type.get());
188 self->raw_value_offsets_ =
189 data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0);
190
191 ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id());
192 DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type));
193 self->values_ = MakeArray(self->data_->child_data[0]);
194}
195
196} // namespace internal
197
198ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); }
199
200LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
201
202ListArray::ListArray(std::shared_ptr<DataType> type, int64_t length,
203 std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
204 std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
205 int64_t offset) {
206 ARROW_CHECK_EQ(type->id(), Type::LIST);
207 auto internal_data = ArrayData::Make(
208 std::move(type), length,
209 BufferVector{std::move(null_bitmap), std::move(value_offsets)}, null_count, offset);
210 internal_data->child_data.emplace_back(values->data());
211 SetData(std::move(internal_data));
212}
213
214void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
215 internal::SetListData(this, data);
216}
217
218LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
219 const std::shared_ptr<Buffer>& value_offsets,
220 const std::shared_ptr<Array>& values,
221 const std::shared_ptr<Buffer>& null_bitmap,
222 int64_t null_count, int64_t offset) {
223 ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST);
224 auto internal_data =
225 ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
226 internal_data->child_data.emplace_back(values->data());
227 SetData(internal_data);
228}
229
230void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
231 internal::SetListData(this, data);
232}
233
234Result<std::shared_ptr<ListArray>> ListArray::FromArrays(const Array& offsets,
235 const Array& values,
236 MemoryPool* pool) {
237 return ListArrayFromArrays<ListType>(offsets, values, pool);
238}
239
240Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(const Array& offsets,
241 const Array& values,
242 MemoryPool* pool) {
243 return ListArrayFromArrays<LargeListType>(offsets, values, pool);
244}
245
246Result<std::shared_ptr<Array>> ListArray::Flatten(MemoryPool* memory_pool) const {
247 return FlattenListArray(*this, memory_pool);
248}
249
250Result<std::shared_ptr<Array>> LargeListArray::Flatten(MemoryPool* memory_pool) const {
251 return FlattenListArray(*this, memory_pool);
252}
253
254static std::shared_ptr<Array> BoxOffsets(const std::shared_ptr<DataType>& boxed_type,
255 const ArrayData& data) {
256 std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[1]};
257 auto offsets_data =
258 std::make_shared<ArrayData>(boxed_type, data.length + 1, std::move(buffers),
259 /*null_count=*/0, data.offset);
260 return MakeArray(offsets_data);
261}
262
263std::shared_ptr<Array> ListArray::offsets() const { return BoxOffsets(int32(), *data_); }
264
265std::shared_ptr<Array> LargeListArray::offsets() const {
266 return BoxOffsets(int64(), *data_);
267}
268
269// ----------------------------------------------------------------------
270// MapArray
271
272MapArray::MapArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
273
274MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
275 const std::shared_ptr<Buffer>& offsets,
276 const std::shared_ptr<Array>& values,
277 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
278 int64_t offset) {
279 SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()},
280 null_count, offset));
281}
282
283MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
284 const std::shared_ptr<Buffer>& offsets,
285 const std::shared_ptr<Array>& keys,
286 const std::shared_ptr<Array>& items,
287 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
288 int64_t offset) {
289 auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length,
290 {nullptr}, {keys->data(), items->data()}, 0, offset);
291 auto map_data = ArrayData::Make(type, length, {null_bitmap, offsets}, {pair_data},
292 null_count, offset);
293 SetData(map_data);
294}
295
296Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
297 std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
298 const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
299 MemoryPool* pool) {
300 using offset_type = typename MapType::offset_type;
301 using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
302
303 if (offsets->length() == 0) {
304 return Status::Invalid("Map offsets must have non-zero length");
305 }
306
307 if (offsets->type_id() != OffsetArrowType::type_id) {
308 return Status::TypeError("Map offsets must be ", OffsetArrowType::type_name());
309 }
310
311 if (keys->null_count() != 0) {
312 return Status::Invalid("Map can not contain NULL valued keys");
313 }
314
315 if (keys->length() != items->length()) {
316 return Status::Invalid("Map key and item arrays must be equal length");
317 }
318
319 std::shared_ptr<Buffer> offset_buf, validity_buf;
320 RETURN_NOT_OK(CleanListOffsets<MapType>(*offsets, pool, &offset_buf, &validity_buf));
321
322 return std::make_shared<MapArray>(type, offsets->length() - 1, offset_buf, keys, items,
323 validity_buf, offsets->null_count(),
324 offsets->offset());
325}
326
327Result<std::shared_ptr<Array>> MapArray::FromArrays(const std::shared_ptr<Array>& offsets,
328 const std::shared_ptr<Array>& keys,
329 const std::shared_ptr<Array>& items,
330 MemoryPool* pool) {
331 return FromArraysInternal(std::make_shared<MapType>(keys->type(), items->type()),
332 offsets, keys, items, pool);
333}
334
335Result<std::shared_ptr<Array>> MapArray::FromArrays(std::shared_ptr<DataType> type,
336 const std::shared_ptr<Array>& offsets,
337 const std::shared_ptr<Array>& keys,
338 const std::shared_ptr<Array>& items,
339 MemoryPool* pool) {
340 if (type->id() != Type::MAP) {
341 return Status::TypeError("Expected map type, got ", type->ToString());
342 }
343 const auto& map_type = checked_cast<const MapType&>(*type);
344 if (!map_type.key_type()->Equals(keys->type())) {
345 return Status::TypeError("Mismatching map keys type");
346 }
347 if (!map_type.item_type()->Equals(items->type())) {
348 return Status::TypeError("Mismatching map items type");
349 }
350 return FromArraysInternal(std::move(type), offsets, keys, items, pool);
351}
352
353Status MapArray::ValidateChildData(
354 const std::vector<std::shared_ptr<ArrayData>>& child_data) {
355 if (child_data.size() != 1) {
356 return Status::Invalid("Expected one child array for map array");
357 }
358 const auto& pair_data = child_data[0];
359 if (pair_data->type->id() != Type::STRUCT) {
360 return Status::Invalid("Map array child array should have struct type");
361 }
362 if (pair_data->null_count != 0) {
363 return Status::Invalid("Map array child array should have no nulls");
364 }
365 if (pair_data->child_data.size() != 2) {
366 return Status::Invalid("Map array child array should have two fields");
367 }
368 if (pair_data->child_data[0]->null_count != 0) {
369 return Status::Invalid("Map array keys array should have no nulls");
370 }
371 return Status::OK();
372}
373
374void MapArray::SetData(const std::shared_ptr<ArrayData>& data) {
375 ARROW_CHECK_OK(ValidateChildData(data->child_data));
376
377 internal::SetListData(this, data, Type::MAP);
378 map_type_ = checked_cast<const MapType*>(data->type.get());
379 const auto& pair_data = data->child_data[0];
380 keys_ = MakeArray(pair_data->child_data[0]);
381 items_ = MakeArray(pair_data->child_data[1]);
382}
383
384// ----------------------------------------------------------------------
385// FixedSizeListArray
386
387FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<ArrayData>& data) {
388 SetData(data);
389}
390
391FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<DataType>& type,
392 int64_t length,
393 const std::shared_ptr<Array>& values,
394 const std::shared_ptr<Buffer>& null_bitmap,
395 int64_t null_count, int64_t offset) {
396 auto internal_data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset);
397 internal_data->child_data.emplace_back(values->data());
398 SetData(internal_data);
399}
400
401void FixedSizeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
402 ARROW_CHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST);
403 this->Array::SetData(data);
404
405 ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id());
406 DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type));
407 list_size_ = list_type()->list_size();
408
409 ARROW_CHECK_EQ(data_->child_data.size(), 1);
410 values_ = MakeArray(data_->child_data[0]);
411}
412
413const FixedSizeListType* FixedSizeListArray::list_type() const {
414 return checked_cast<const FixedSizeListType*>(data_->type.get());
415}
416
417std::shared_ptr<DataType> FixedSizeListArray::value_type() const {
418 return list_type()->value_type();
419}
420
421std::shared_ptr<Array> FixedSizeListArray::values() const { return values_; }
422
423Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
424 const std::shared_ptr<Array>& values, int32_t list_size) {
425 if (list_size <= 0) {
426 return Status::Invalid("list_size needs to be a strict positive integer");
427 }
428
429 if ((values->length() % list_size) != 0) {
430 return Status::Invalid(
431 "The length of the values Array needs to be a multiple of the list_size");
432 }
433 int64_t length = values->length() / list_size;
434 auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size);
435 std::shared_ptr<Buffer> validity_buf;
436
437 return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf,
438 /*null_count=*/0, /*offset=*/0);
439}
440
441Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
442 MemoryPool* memory_pool) const {
443 return FlattenListArray(*this, memory_pool);
444}
445
446// ----------------------------------------------------------------------
447// Struct
448
449StructArray::StructArray(const std::shared_ptr<ArrayData>& data) {
450 ARROW_CHECK_EQ(data->type->id(), Type::STRUCT);
451 SetData(data);
452 boxed_fields_.resize(data->child_data.size());
453}
454
455StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
456 const std::vector<std::shared_ptr<Array>>& children,
457 std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
458 int64_t offset) {
459 ARROW_CHECK_EQ(type->id(), Type::STRUCT);
460 SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset));
461 for (const auto& child : children) {
462 data_->child_data.push_back(child->data());
463 }
464 boxed_fields_.resize(children.size());
465}
466
467Result<std::shared_ptr<StructArray>> StructArray::Make(
468 const std::vector<std::shared_ptr<Array>>& children,
469 const std::vector<std::shared_ptr<Field>>& fields,
470 std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
471 if (children.size() != fields.size()) {
472 return Status::Invalid("Mismatching number of fields and child arrays");
473 }
474 int64_t length = 0;
475 if (children.size() == 0) {
476 return Status::Invalid("Can't infer struct array length with 0 child arrays");
477 }
478 length = children.front()->length();
479 for (const auto& child : children) {
480 if (length != child->length()) {
481 return Status::Invalid("Mismatching child array lengths");
482 }
483 }
484 if (offset > length) {
485 return Status::IndexError("Offset greater than length of child arrays");
486 }
487 if (null_bitmap == nullptr) {
488 if (null_count > 0) {
489 return Status::Invalid("null_count = ", null_count, " but no null bitmap given");
490 }
491 null_count = 0;
492 }
493 return std::make_shared<StructArray>(struct_(fields), length - offset, children,
494 null_bitmap, null_count, offset);
495}
496
497Result<std::shared_ptr<StructArray>> StructArray::Make(
498 const std::vector<std::shared_ptr<Array>>& children,
499 const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
500 int64_t null_count, int64_t offset) {
501 if (children.size() != field_names.size()) {
502 return Status::Invalid("Mismatching number of field names and child arrays");
503 }
504 std::vector<std::shared_ptr<Field>> fields(children.size());
505 for (size_t i = 0; i < children.size(); ++i) {
506 fields[i] = ::arrow::field(field_names[i], children[i]->type());
507 }
508 return Make(children, fields, std::move(null_bitmap), null_count, offset);
509}
510
511const StructType* StructArray::struct_type() const {
512 return checked_cast<const StructType*>(data_->type.get());
513}
514
515const ArrayVector& StructArray::fields() const {
516 for (int i = 0; i < num_fields(); ++i) {
517 (void)field(i);
518 }
519 return boxed_fields_;
520}
521
522std::shared_ptr<Array> StructArray::field(int i) const {
523 std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
524 if (!result) {
525 std::shared_ptr<ArrayData> field_data;
526 if (data_->offset != 0 || data_->child_data[i]->length != data_->length) {
527 field_data = data_->child_data[i]->Slice(data_->offset, data_->length);
528 } else {
529 field_data = data_->child_data[i];
530 }
531 result = MakeArray(field_data);
532 internal::atomic_store(&boxed_fields_[i], result);
533 }
534 return result;
535}
536
537std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const {
538 int i = struct_type()->GetFieldIndex(name);
539 return i == -1 ? nullptr : field(i);
540}
541
542Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const {
543 ArrayVector flattened;
544 flattened.reserve(data_->child_data.size());
545 std::shared_ptr<Buffer> null_bitmap = data_->buffers[0];
546
547 for (const auto& child_data_ptr : data_->child_data) {
548 auto child_data = child_data_ptr->Copy();
549
550 std::shared_ptr<Buffer> flattened_null_bitmap;
551 int64_t flattened_null_count = kUnknownNullCount;
552
553 // Need to adjust for parent offset
554 if (data_->offset != 0 || data_->length != child_data->length) {
555 child_data = child_data->Slice(data_->offset, data_->length);
556 }
557 std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0];
558 const int64_t child_offset = child_data->offset;
559
560 // The validity of a flattened datum is the logical AND of the struct
561 // element's validity and the individual field element's validity.
562 if (null_bitmap && child_null_bitmap) {
563 ARROW_ASSIGN_OR_RAISE(
564 flattened_null_bitmap,
565 BitmapAnd(pool, child_null_bitmap->data(), child_offset, null_bitmap_data_,
566 data_->offset, data_->length, child_offset));
567 } else if (child_null_bitmap) {
568 flattened_null_bitmap = child_null_bitmap;
569 flattened_null_count = child_data->null_count;
570 } else if (null_bitmap) {
571 if (child_offset == data_->offset) {
572 flattened_null_bitmap = null_bitmap;
573 } else {
574 // If the child has an offset, need to synthesize a validity
575 // buffer with an offset too
576 ARROW_ASSIGN_OR_RAISE(flattened_null_bitmap,
577 AllocateEmptyBitmap(child_offset + data_->length, pool));
578 CopyBitmap(null_bitmap_data_, data_->offset, data_->length,
579 flattened_null_bitmap->mutable_data(), child_offset);
580 }
581 flattened_null_count = data_->null_count;
582 } else {
583 flattened_null_count = 0;
584 }
585
586 auto flattened_data = child_data->Copy();
587 flattened_data->buffers[0] = flattened_null_bitmap;
588 flattened_data->null_count = flattened_null_count;
589
590 flattened.push_back(MakeArray(flattened_data));
591 }
592
593 return flattened;
594}
595
596// ----------------------------------------------------------------------
597// UnionArray
598
599void UnionArray::SetData(std::shared_ptr<ArrayData> data) {
600 this->Array::SetData(std::move(data));
601
602 union_type_ = checked_cast<const UnionType*>(data_->type.get());
603
604 ARROW_CHECK_GE(data_->buffers.size(), 2);
605 raw_type_codes_ = data->GetValuesSafe<int8_t>(1, /*offset=*/0);
606 boxed_fields_.resize(data_->child_data.size());
607}
608
609void SparseUnionArray::SetData(std::shared_ptr<ArrayData> data) {
610 this->UnionArray::SetData(std::move(data));
611 ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION);
612 ARROW_CHECK_EQ(data_->buffers.size(), 2);
613
614 // No validity bitmap
615 ARROW_CHECK_EQ(data_->buffers[0], nullptr);
616}
617
618void DenseUnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
619 this->UnionArray::SetData(std::move(data));
620
621 ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION);
622 ARROW_CHECK_EQ(data_->buffers.size(), 3);
623
624 // No validity bitmap
625 ARROW_CHECK_EQ(data_->buffers[0], nullptr);
626
627 raw_value_offsets_ = data->GetValuesSafe<int32_t>(2, /*offset=*/0);
628}
629
630SparseUnionArray::SparseUnionArray(std::shared_ptr<ArrayData> data) {
631 SetData(std::move(data));
632}
633
634SparseUnionArray::SparseUnionArray(std::shared_ptr<DataType> type, int64_t length,
635 ArrayVector children,
636 std::shared_ptr<Buffer> type_codes, int64_t offset) {
637 auto internal_data = ArrayData::Make(std::move(type), length,
638 BufferVector{nullptr, std::move(type_codes)},
639 /*null_count=*/0, offset);
640 for (const auto& child : children) {
641 internal_data->child_data.push_back(child->data());
642 }
643 SetData(std::move(internal_data));
644}
645
646DenseUnionArray::DenseUnionArray(const std::shared_ptr<ArrayData>& data) {
647 SetData(data);
648}
649
650DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length,
651 ArrayVector children, std::shared_ptr<Buffer> type_ids,
652 std::shared_ptr<Buffer> value_offsets, int64_t offset) {
653 auto internal_data = ArrayData::Make(
654 std::move(type), length,
655 BufferVector{nullptr, std::move(type_ids), std::move(value_offsets)},
656 /*null_count=*/0, offset);
657 for (const auto& child : children) {
658 internal_data->child_data.push_back(child->data());
659 }
660 SetData(internal_data);
661}
662
663Result<std::shared_ptr<Array>> DenseUnionArray::Make(
664 const Array& type_ids, const Array& value_offsets, ArrayVector children,
665 std::vector<std::string> field_names, std::vector<type_code_t> type_codes) {
666 if (value_offsets.length() == 0) {
667 return Status::Invalid("UnionArray offsets must have non-zero length");
668 }
669
670 if (value_offsets.type_id() != Type::INT32) {
671 return Status::TypeError("UnionArray offsets must be signed int32");
672 }
673
674 if (type_ids.type_id() != Type::INT8) {
675 return Status::TypeError("UnionArray type_ids must be signed int8");
676 }
677
678 if (type_ids.null_count() != 0) {
679 return Status::Invalid("Union type ids may not have nulls");
680 }
681
682 if (value_offsets.null_count() != 0) {
683 return Status::Invalid("Make does not allow nulls in value_offsets");
684 }
685
686 if (field_names.size() > 0 && field_names.size() != children.size()) {
687 return Status::Invalid("field_names must have the same length as children");
688 }
689
690 if (type_codes.size() > 0 && type_codes.size() != children.size()) {
691 return Status::Invalid("type_codes must have the same length as children");
692 }
693
694 BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values(),
695 checked_cast<const Int32Array&>(value_offsets).values()};
696
697 auto union_type = dense_union(children, std::move(field_names), std::move(type_codes));
698 auto internal_data =
699 ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
700 /*null_count=*/0, type_ids.offset());
701 for (const auto& child : children) {
702 internal_data->child_data.push_back(child->data());
703 }
704 return std::make_shared<DenseUnionArray>(std::move(internal_data));
705}
706
707Result<std::shared_ptr<Array>> SparseUnionArray::Make(
708 const Array& type_ids, ArrayVector children, std::vector<std::string> field_names,
709 std::vector<int8_t> type_codes) {
710 if (type_ids.type_id() != Type::INT8) {
711 return Status::TypeError("UnionArray type_ids must be signed int8");
712 }
713
714 if (type_ids.null_count() != 0) {
715 return Status::Invalid("Union type ids may not have nulls");
716 }
717
718 if (field_names.size() > 0 && field_names.size() != children.size()) {
719 return Status::Invalid("field_names must have the same length as children");
720 }
721
722 if (type_codes.size() > 0 && type_codes.size() != children.size()) {
723 return Status::Invalid("type_codes must have the same length as children");
724 }
725
726 BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values()};
727 auto union_type = sparse_union(children, std::move(field_names), std::move(type_codes));
728 auto internal_data =
729 ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
730 /*null_count=*/0, type_ids.offset());
731 for (const auto& child : children) {
732 internal_data->child_data.push_back(child->data());
733 if (child->length() != type_ids.length()) {
734 return Status::Invalid(
735 "Sparse UnionArray must have len(child) == len(type_ids) for all children");
736 }
737 }
738 return std::make_shared<SparseUnionArray>(std::move(internal_data));
739}
740
741std::shared_ptr<Array> UnionArray::field(int i) const {
742 if (i < 0 ||
743 static_cast<decltype(boxed_fields_)::size_type>(i) >= boxed_fields_.size()) {
744 return nullptr;
745 }
746 std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
747 if (!result) {
748 std::shared_ptr<ArrayData> child_data = data_->child_data[i]->Copy();
749 if (mode() == UnionMode::SPARSE) {
750 // Sparse union: need to adjust child if union is sliced
751 // (for dense unions, the need to lookup through the offsets
752 // makes this unnecessary)
753 if (data_->offset != 0 || child_data->length > data_->length) {
754 child_data = child_data->Slice(data_->offset, data_->length);
755 }
756 }
757 result = MakeArray(child_data);
758 internal::atomic_store(&boxed_fields_[i], result);
759 }
760 return result;
761}
762
763} // namespace arrow