]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "arrow/array/array_nested.h" | |
19 | ||
20 | #include <cstddef> | |
21 | #include <cstdint> | |
22 | #include <memory> | |
23 | #include <string> | |
24 | #include <utility> | |
25 | #include <vector> | |
26 | ||
27 | #include "arrow/array/array_base.h" | |
28 | #include "arrow/array/array_primitive.h" | |
29 | #include "arrow/array/concatenate.h" | |
30 | #include "arrow/array/util.h" | |
31 | #include "arrow/buffer.h" | |
32 | #include "arrow/status.h" | |
33 | #include "arrow/type.h" | |
34 | #include "arrow/type_fwd.h" | |
35 | #include "arrow/type_traits.h" | |
36 | #include "arrow/util/atomic_shared_ptr.h" | |
37 | #include "arrow/util/bit_util.h" | |
38 | #include "arrow/util/bitmap_ops.h" | |
39 | #include "arrow/util/checked_cast.h" | |
40 | #include "arrow/util/logging.h" | |
41 | ||
42 | namespace arrow { | |
43 | ||
44 | using internal::BitmapAnd; | |
45 | using internal::checked_cast; | |
46 | using internal::checked_pointer_cast; | |
47 | using internal::CopyBitmap; | |
48 | ||
49 | // ---------------------------------------------------------------------- | |
50 | // ListArray / LargeListArray | |
51 | ||
52 | namespace { | |
53 | ||
54 | template <typename TYPE> | |
55 | Status CleanListOffsets(const Array& offsets, MemoryPool* pool, | |
56 | std::shared_ptr<Buffer>* offset_buf_out, | |
57 | std::shared_ptr<Buffer>* validity_buf_out) { | |
58 | using offset_type = typename TYPE::offset_type; | |
59 | using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; | |
60 | using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; | |
61 | ||
62 | const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets); | |
63 | const int64_t num_offsets = offsets.length(); | |
64 | ||
65 | if (offsets.null_count() > 0) { | |
66 | if (!offsets.IsValid(num_offsets - 1)) { | |
67 | return Status::Invalid("Last list offset should be non-null"); | |
68 | } | |
69 | ||
70 | ARROW_ASSIGN_OR_RAISE(auto clean_offsets, | |
71 | AllocateBuffer(num_offsets * sizeof(offset_type), pool)); | |
72 | ||
73 | // Copy valid bits, ignoring the final offset (since for a length N list array, | |
74 | // we have N + 1 offsets) | |
75 | ARROW_ASSIGN_OR_RAISE( | |
76 | auto clean_valid_bits, | |
77 | offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1))); | |
78 | *validity_buf_out = clean_valid_bits; | |
79 | ||
80 | const offset_type* raw_offsets = typed_offsets.raw_values(); | |
81 | auto clean_raw_offsets = | |
82 | reinterpret_cast<offset_type*>(clean_offsets->mutable_data()); | |
83 | ||
84 | // Must work backwards so we can tell how many values were in the last non-null value | |
85 | offset_type current_offset = raw_offsets[num_offsets - 1]; | |
86 | for (int64_t i = num_offsets - 1; i >= 0; --i) { | |
87 | if (offsets.IsValid(i)) { | |
88 | current_offset = raw_offsets[i]; | |
89 | } | |
90 | clean_raw_offsets[i] = current_offset; | |
91 | } | |
92 | ||
93 | *offset_buf_out = std::move(clean_offsets); | |
94 | } else { | |
95 | *validity_buf_out = offsets.null_bitmap(); | |
96 | *offset_buf_out = typed_offsets.values(); | |
97 | } | |
98 | ||
99 | return Status::OK(); | |
100 | } | |
101 | ||
102 | template <typename TYPE> | |
103 | Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArrays( | |
104 | const Array& offsets, const Array& values, MemoryPool* pool) { | |
105 | using offset_type = typename TYPE::offset_type; | |
106 | using ArrayType = typename TypeTraits<TYPE>::ArrayType; | |
107 | using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; | |
108 | ||
109 | if (offsets.length() == 0) { | |
110 | return Status::Invalid("List offsets must have non-zero length"); | |
111 | } | |
112 | ||
113 | if (offsets.type_id() != OffsetArrowType::type_id) { | |
114 | return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); | |
115 | } | |
116 | ||
117 | std::shared_ptr<Buffer> offset_buf, validity_buf; | |
118 | RETURN_NOT_OK(CleanListOffsets<TYPE>(offsets, pool, &offset_buf, &validity_buf)); | |
119 | BufferVector buffers = {validity_buf, offset_buf}; | |
120 | ||
121 | auto list_type = std::make_shared<TYPE>(values.type()); | |
122 | auto internal_data = | |
123 | ArrayData::Make(list_type, offsets.length() - 1, std::move(buffers), | |
124 | offsets.null_count(), offsets.offset()); | |
125 | internal_data->child_data.push_back(values.data()); | |
126 | ||
127 | return std::make_shared<ArrayType>(internal_data); | |
128 | } | |
129 | ||
130 | static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array, int64_t begin, | |
131 | int64_t end) { | |
132 | return array.Slice(begin, end - begin); | |
133 | } | |
134 | ||
135 | template <typename ListArrayT> | |
136 | Result<std::shared_ptr<Array>> FlattenListArray(const ListArrayT& list_array, | |
137 | MemoryPool* memory_pool) { | |
138 | const int64_t list_array_length = list_array.length(); | |
139 | std::shared_ptr<arrow::Array> value_array = list_array.values(); | |
140 | ||
141 | // Shortcut: if a ListArray does not contain nulls, then simply slice its | |
142 | // value array with the first and the last offsets. | |
143 | if (list_array.null_count() == 0) { | |
144 | return SliceArrayWithOffsets(*value_array, list_array.value_offset(0), | |
145 | list_array.value_offset(list_array_length)); | |
146 | } | |
147 | ||
148 | // The ListArray contains nulls: there may be a non-empty sub-list behind | |
149 | // a null and it must not be contained in the result. | |
150 | std::vector<std::shared_ptr<Array>> non_null_fragments; | |
151 | int64_t valid_begin = 0; | |
152 | while (valid_begin < list_array_length) { | |
153 | int64_t valid_end = valid_begin; | |
154 | while (valid_end < list_array_length && | |
155 | (list_array.IsValid(valid_end) || list_array.value_length(valid_end) == 0)) { | |
156 | ++valid_end; | |
157 | } | |
158 | if (valid_begin < valid_end) { | |
159 | non_null_fragments.push_back( | |
160 | SliceArrayWithOffsets(*value_array, list_array.value_offset(valid_begin), | |
161 | list_array.value_offset(valid_end))); | |
162 | } | |
163 | valid_begin = valid_end + 1; // skip null entry | |
164 | } | |
165 | ||
166 | // Final attempt to avoid invoking Concatenate(). | |
167 | if (non_null_fragments.size() == 1) { | |
168 | return non_null_fragments[0]; | |
169 | } | |
170 | ||
171 | return Concatenate(non_null_fragments, memory_pool); | |
172 | } | |
173 | ||
174 | } // namespace | |
175 | ||
176 | namespace internal { | |
177 | ||
178 | template <typename TYPE> | |
179 | inline void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data, | |
180 | Type::type expected_type_id) { | |
181 | ARROW_CHECK_EQ(data->buffers.size(), 2); | |
182 | ARROW_CHECK_EQ(data->type->id(), expected_type_id); | |
183 | ARROW_CHECK_EQ(data->child_data.size(), 1); | |
184 | ||
185 | self->Array::SetData(data); | |
186 | ||
187 | self->list_type_ = checked_cast<const TYPE*>(data->type.get()); | |
188 | self->raw_value_offsets_ = | |
189 | data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0); | |
190 | ||
191 | ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); | |
192 | DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); | |
193 | self->values_ = MakeArray(self->data_->child_data[0]); | |
194 | } | |
195 | ||
196 | } // namespace internal | |
197 | ||
198 | ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); } | |
199 | ||
200 | LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } | |
201 | ||
202 | ListArray::ListArray(std::shared_ptr<DataType> type, int64_t length, | |
203 | std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values, | |
204 | std::shared_ptr<Buffer> null_bitmap, int64_t null_count, | |
205 | int64_t offset) { | |
206 | ARROW_CHECK_EQ(type->id(), Type::LIST); | |
207 | auto internal_data = ArrayData::Make( | |
208 | std::move(type), length, | |
209 | BufferVector{std::move(null_bitmap), std::move(value_offsets)}, null_count, offset); | |
210 | internal_data->child_data.emplace_back(values->data()); | |
211 | SetData(std::move(internal_data)); | |
212 | } | |
213 | ||
214 | void ListArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
215 | internal::SetListData(this, data); | |
216 | } | |
217 | ||
218 | LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length, | |
219 | const std::shared_ptr<Buffer>& value_offsets, | |
220 | const std::shared_ptr<Array>& values, | |
221 | const std::shared_ptr<Buffer>& null_bitmap, | |
222 | int64_t null_count, int64_t offset) { | |
223 | ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST); | |
224 | auto internal_data = | |
225 | ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); | |
226 | internal_data->child_data.emplace_back(values->data()); | |
227 | SetData(internal_data); | |
228 | } | |
229 | ||
230 | void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
231 | internal::SetListData(this, data); | |
232 | } | |
233 | ||
234 | Result<std::shared_ptr<ListArray>> ListArray::FromArrays(const Array& offsets, | |
235 | const Array& values, | |
236 | MemoryPool* pool) { | |
237 | return ListArrayFromArrays<ListType>(offsets, values, pool); | |
238 | } | |
239 | ||
240 | Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(const Array& offsets, | |
241 | const Array& values, | |
242 | MemoryPool* pool) { | |
243 | return ListArrayFromArrays<LargeListType>(offsets, values, pool); | |
244 | } | |
245 | ||
246 | Result<std::shared_ptr<Array>> ListArray::Flatten(MemoryPool* memory_pool) const { | |
247 | return FlattenListArray(*this, memory_pool); | |
248 | } | |
249 | ||
250 | Result<std::shared_ptr<Array>> LargeListArray::Flatten(MemoryPool* memory_pool) const { | |
251 | return FlattenListArray(*this, memory_pool); | |
252 | } | |
253 | ||
254 | static std::shared_ptr<Array> BoxOffsets(const std::shared_ptr<DataType>& boxed_type, | |
255 | const ArrayData& data) { | |
256 | std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[1]}; | |
257 | auto offsets_data = | |
258 | std::make_shared<ArrayData>(boxed_type, data.length + 1, std::move(buffers), | |
259 | /*null_count=*/0, data.offset); | |
260 | return MakeArray(offsets_data); | |
261 | } | |
262 | ||
263 | std::shared_ptr<Array> ListArray::offsets() const { return BoxOffsets(int32(), *data_); } | |
264 | ||
265 | std::shared_ptr<Array> LargeListArray::offsets() const { | |
266 | return BoxOffsets(int64(), *data_); | |
267 | } | |
268 | ||
269 | // ---------------------------------------------------------------------- | |
270 | // MapArray | |
271 | ||
272 | MapArray::MapArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } | |
273 | ||
274 | MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length, | |
275 | const std::shared_ptr<Buffer>& offsets, | |
276 | const std::shared_ptr<Array>& values, | |
277 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, | |
278 | int64_t offset) { | |
279 | SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()}, | |
280 | null_count, offset)); | |
281 | } | |
282 | ||
283 | MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length, | |
284 | const std::shared_ptr<Buffer>& offsets, | |
285 | const std::shared_ptr<Array>& keys, | |
286 | const std::shared_ptr<Array>& items, | |
287 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, | |
288 | int64_t offset) { | |
289 | auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length, | |
290 | {nullptr}, {keys->data(), items->data()}, 0, offset); | |
291 | auto map_data = ArrayData::Make(type, length, {null_bitmap, offsets}, {pair_data}, | |
292 | null_count, offset); | |
293 | SetData(map_data); | |
294 | } | |
295 | ||
296 | Result<std::shared_ptr<Array>> MapArray::FromArraysInternal( | |
297 | std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets, | |
298 | const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items, | |
299 | MemoryPool* pool) { | |
300 | using offset_type = typename MapType::offset_type; | |
301 | using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; | |
302 | ||
303 | if (offsets->length() == 0) { | |
304 | return Status::Invalid("Map offsets must have non-zero length"); | |
305 | } | |
306 | ||
307 | if (offsets->type_id() != OffsetArrowType::type_id) { | |
308 | return Status::TypeError("Map offsets must be ", OffsetArrowType::type_name()); | |
309 | } | |
310 | ||
311 | if (keys->null_count() != 0) { | |
312 | return Status::Invalid("Map can not contain NULL valued keys"); | |
313 | } | |
314 | ||
315 | if (keys->length() != items->length()) { | |
316 | return Status::Invalid("Map key and item arrays must be equal length"); | |
317 | } | |
318 | ||
319 | std::shared_ptr<Buffer> offset_buf, validity_buf; | |
320 | RETURN_NOT_OK(CleanListOffsets<MapType>(*offsets, pool, &offset_buf, &validity_buf)); | |
321 | ||
322 | return std::make_shared<MapArray>(type, offsets->length() - 1, offset_buf, keys, items, | |
323 | validity_buf, offsets->null_count(), | |
324 | offsets->offset()); | |
325 | } | |
326 | ||
327 | Result<std::shared_ptr<Array>> MapArray::FromArrays(const std::shared_ptr<Array>& offsets, | |
328 | const std::shared_ptr<Array>& keys, | |
329 | const std::shared_ptr<Array>& items, | |
330 | MemoryPool* pool) { | |
331 | return FromArraysInternal(std::make_shared<MapType>(keys->type(), items->type()), | |
332 | offsets, keys, items, pool); | |
333 | } | |
334 | ||
335 | Result<std::shared_ptr<Array>> MapArray::FromArrays(std::shared_ptr<DataType> type, | |
336 | const std::shared_ptr<Array>& offsets, | |
337 | const std::shared_ptr<Array>& keys, | |
338 | const std::shared_ptr<Array>& items, | |
339 | MemoryPool* pool) { | |
340 | if (type->id() != Type::MAP) { | |
341 | return Status::TypeError("Expected map type, got ", type->ToString()); | |
342 | } | |
343 | const auto& map_type = checked_cast<const MapType&>(*type); | |
344 | if (!map_type.key_type()->Equals(keys->type())) { | |
345 | return Status::TypeError("Mismatching map keys type"); | |
346 | } | |
347 | if (!map_type.item_type()->Equals(items->type())) { | |
348 | return Status::TypeError("Mismatching map items type"); | |
349 | } | |
350 | return FromArraysInternal(std::move(type), offsets, keys, items, pool); | |
351 | } | |
352 | ||
353 | Status MapArray::ValidateChildData( | |
354 | const std::vector<std::shared_ptr<ArrayData>>& child_data) { | |
355 | if (child_data.size() != 1) { | |
356 | return Status::Invalid("Expected one child array for map array"); | |
357 | } | |
358 | const auto& pair_data = child_data[0]; | |
359 | if (pair_data->type->id() != Type::STRUCT) { | |
360 | return Status::Invalid("Map array child array should have struct type"); | |
361 | } | |
362 | if (pair_data->null_count != 0) { | |
363 | return Status::Invalid("Map array child array should have no nulls"); | |
364 | } | |
365 | if (pair_data->child_data.size() != 2) { | |
366 | return Status::Invalid("Map array child array should have two fields"); | |
367 | } | |
368 | if (pair_data->child_data[0]->null_count != 0) { | |
369 | return Status::Invalid("Map array keys array should have no nulls"); | |
370 | } | |
371 | return Status::OK(); | |
372 | } | |
373 | ||
374 | void MapArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
375 | ARROW_CHECK_OK(ValidateChildData(data->child_data)); | |
376 | ||
377 | internal::SetListData(this, data, Type::MAP); | |
378 | map_type_ = checked_cast<const MapType*>(data->type.get()); | |
379 | const auto& pair_data = data->child_data[0]; | |
380 | keys_ = MakeArray(pair_data->child_data[0]); | |
381 | items_ = MakeArray(pair_data->child_data[1]); | |
382 | } | |
383 | ||
384 | // ---------------------------------------------------------------------- | |
385 | // FixedSizeListArray | |
386 | ||
387 | FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<ArrayData>& data) { | |
388 | SetData(data); | |
389 | } | |
390 | ||
391 | FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<DataType>& type, | |
392 | int64_t length, | |
393 | const std::shared_ptr<Array>& values, | |
394 | const std::shared_ptr<Buffer>& null_bitmap, | |
395 | int64_t null_count, int64_t offset) { | |
396 | auto internal_data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset); | |
397 | internal_data->child_data.emplace_back(values->data()); | |
398 | SetData(internal_data); | |
399 | } | |
400 | ||
401 | void FixedSizeListArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
402 | ARROW_CHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST); | |
403 | this->Array::SetData(data); | |
404 | ||
405 | ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id()); | |
406 | DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type)); | |
407 | list_size_ = list_type()->list_size(); | |
408 | ||
409 | ARROW_CHECK_EQ(data_->child_data.size(), 1); | |
410 | values_ = MakeArray(data_->child_data[0]); | |
411 | } | |
412 | ||
413 | const FixedSizeListType* FixedSizeListArray::list_type() const { | |
414 | return checked_cast<const FixedSizeListType*>(data_->type.get()); | |
415 | } | |
416 | ||
417 | std::shared_ptr<DataType> FixedSizeListArray::value_type() const { | |
418 | return list_type()->value_type(); | |
419 | } | |
420 | ||
421 | std::shared_ptr<Array> FixedSizeListArray::values() const { return values_; } | |
422 | ||
423 | Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays( | |
424 | const std::shared_ptr<Array>& values, int32_t list_size) { | |
425 | if (list_size <= 0) { | |
426 | return Status::Invalid("list_size needs to be a strict positive integer"); | |
427 | } | |
428 | ||
429 | if ((values->length() % list_size) != 0) { | |
430 | return Status::Invalid( | |
431 | "The length of the values Array needs to be a multiple of the list_size"); | |
432 | } | |
433 | int64_t length = values->length() / list_size; | |
434 | auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size); | |
435 | std::shared_ptr<Buffer> validity_buf; | |
436 | ||
437 | return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf, | |
438 | /*null_count=*/0, /*offset=*/0); | |
439 | } | |
440 | ||
441 | Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten( | |
442 | MemoryPool* memory_pool) const { | |
443 | return FlattenListArray(*this, memory_pool); | |
444 | } | |
445 | ||
446 | // ---------------------------------------------------------------------- | |
447 | // Struct | |
448 | ||
449 | StructArray::StructArray(const std::shared_ptr<ArrayData>& data) { | |
450 | ARROW_CHECK_EQ(data->type->id(), Type::STRUCT); | |
451 | SetData(data); | |
452 | boxed_fields_.resize(data->child_data.size()); | |
453 | } | |
454 | ||
455 | StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length, | |
456 | const std::vector<std::shared_ptr<Array>>& children, | |
457 | std::shared_ptr<Buffer> null_bitmap, int64_t null_count, | |
458 | int64_t offset) { | |
459 | ARROW_CHECK_EQ(type->id(), Type::STRUCT); | |
460 | SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); | |
461 | for (const auto& child : children) { | |
462 | data_->child_data.push_back(child->data()); | |
463 | } | |
464 | boxed_fields_.resize(children.size()); | |
465 | } | |
466 | ||
467 | Result<std::shared_ptr<StructArray>> StructArray::Make( | |
468 | const std::vector<std::shared_ptr<Array>>& children, | |
469 | const std::vector<std::shared_ptr<Field>>& fields, | |
470 | std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) { | |
471 | if (children.size() != fields.size()) { | |
472 | return Status::Invalid("Mismatching number of fields and child arrays"); | |
473 | } | |
474 | int64_t length = 0; | |
475 | if (children.size() == 0) { | |
476 | return Status::Invalid("Can't infer struct array length with 0 child arrays"); | |
477 | } | |
478 | length = children.front()->length(); | |
479 | for (const auto& child : children) { | |
480 | if (length != child->length()) { | |
481 | return Status::Invalid("Mismatching child array lengths"); | |
482 | } | |
483 | } | |
484 | if (offset > length) { | |
485 | return Status::IndexError("Offset greater than length of child arrays"); | |
486 | } | |
487 | if (null_bitmap == nullptr) { | |
488 | if (null_count > 0) { | |
489 | return Status::Invalid("null_count = ", null_count, " but no null bitmap given"); | |
490 | } | |
491 | null_count = 0; | |
492 | } | |
493 | return std::make_shared<StructArray>(struct_(fields), length - offset, children, | |
494 | null_bitmap, null_count, offset); | |
495 | } | |
496 | ||
497 | Result<std::shared_ptr<StructArray>> StructArray::Make( | |
498 | const std::vector<std::shared_ptr<Array>>& children, | |
499 | const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap, | |
500 | int64_t null_count, int64_t offset) { | |
501 | if (children.size() != field_names.size()) { | |
502 | return Status::Invalid("Mismatching number of field names and child arrays"); | |
503 | } | |
504 | std::vector<std::shared_ptr<Field>> fields(children.size()); | |
505 | for (size_t i = 0; i < children.size(); ++i) { | |
506 | fields[i] = ::arrow::field(field_names[i], children[i]->type()); | |
507 | } | |
508 | return Make(children, fields, std::move(null_bitmap), null_count, offset); | |
509 | } | |
510 | ||
511 | const StructType* StructArray::struct_type() const { | |
512 | return checked_cast<const StructType*>(data_->type.get()); | |
513 | } | |
514 | ||
515 | const ArrayVector& StructArray::fields() const { | |
516 | for (int i = 0; i < num_fields(); ++i) { | |
517 | (void)field(i); | |
518 | } | |
519 | return boxed_fields_; | |
520 | } | |
521 | ||
522 | std::shared_ptr<Array> StructArray::field(int i) const { | |
523 | std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]); | |
524 | if (!result) { | |
525 | std::shared_ptr<ArrayData> field_data; | |
526 | if (data_->offset != 0 || data_->child_data[i]->length != data_->length) { | |
527 | field_data = data_->child_data[i]->Slice(data_->offset, data_->length); | |
528 | } else { | |
529 | field_data = data_->child_data[i]; | |
530 | } | |
531 | result = MakeArray(field_data); | |
532 | internal::atomic_store(&boxed_fields_[i], result); | |
533 | } | |
534 | return result; | |
535 | } | |
536 | ||
537 | std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const { | |
538 | int i = struct_type()->GetFieldIndex(name); | |
539 | return i == -1 ? nullptr : field(i); | |
540 | } | |
541 | ||
542 | Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const { | |
543 | ArrayVector flattened; | |
544 | flattened.reserve(data_->child_data.size()); | |
545 | std::shared_ptr<Buffer> null_bitmap = data_->buffers[0]; | |
546 | ||
547 | for (const auto& child_data_ptr : data_->child_data) { | |
548 | auto child_data = child_data_ptr->Copy(); | |
549 | ||
550 | std::shared_ptr<Buffer> flattened_null_bitmap; | |
551 | int64_t flattened_null_count = kUnknownNullCount; | |
552 | ||
553 | // Need to adjust for parent offset | |
554 | if (data_->offset != 0 || data_->length != child_data->length) { | |
555 | child_data = child_data->Slice(data_->offset, data_->length); | |
556 | } | |
557 | std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; | |
558 | const int64_t child_offset = child_data->offset; | |
559 | ||
560 | // The validity of a flattened datum is the logical AND of the struct | |
561 | // element's validity and the individual field element's validity. | |
562 | if (null_bitmap && child_null_bitmap) { | |
563 | ARROW_ASSIGN_OR_RAISE( | |
564 | flattened_null_bitmap, | |
565 | BitmapAnd(pool, child_null_bitmap->data(), child_offset, null_bitmap_data_, | |
566 | data_->offset, data_->length, child_offset)); | |
567 | } else if (child_null_bitmap) { | |
568 | flattened_null_bitmap = child_null_bitmap; | |
569 | flattened_null_count = child_data->null_count; | |
570 | } else if (null_bitmap) { | |
571 | if (child_offset == data_->offset) { | |
572 | flattened_null_bitmap = null_bitmap; | |
573 | } else { | |
574 | // If the child has an offset, need to synthesize a validity | |
575 | // buffer with an offset too | |
576 | ARROW_ASSIGN_OR_RAISE(flattened_null_bitmap, | |
577 | AllocateEmptyBitmap(child_offset + data_->length, pool)); | |
578 | CopyBitmap(null_bitmap_data_, data_->offset, data_->length, | |
579 | flattened_null_bitmap->mutable_data(), child_offset); | |
580 | } | |
581 | flattened_null_count = data_->null_count; | |
582 | } else { | |
583 | flattened_null_count = 0; | |
584 | } | |
585 | ||
586 | auto flattened_data = child_data->Copy(); | |
587 | flattened_data->buffers[0] = flattened_null_bitmap; | |
588 | flattened_data->null_count = flattened_null_count; | |
589 | ||
590 | flattened.push_back(MakeArray(flattened_data)); | |
591 | } | |
592 | ||
593 | return flattened; | |
594 | } | |
595 | ||
596 | // ---------------------------------------------------------------------- | |
597 | // UnionArray | |
598 | ||
599 | void UnionArray::SetData(std::shared_ptr<ArrayData> data) { | |
600 | this->Array::SetData(std::move(data)); | |
601 | ||
602 | union_type_ = checked_cast<const UnionType*>(data_->type.get()); | |
603 | ||
604 | ARROW_CHECK_GE(data_->buffers.size(), 2); | |
605 | raw_type_codes_ = data->GetValuesSafe<int8_t>(1, /*offset=*/0); | |
606 | boxed_fields_.resize(data_->child_data.size()); | |
607 | } | |
608 | ||
609 | void SparseUnionArray::SetData(std::shared_ptr<ArrayData> data) { | |
610 | this->UnionArray::SetData(std::move(data)); | |
611 | ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION); | |
612 | ARROW_CHECK_EQ(data_->buffers.size(), 2); | |
613 | ||
614 | // No validity bitmap | |
615 | ARROW_CHECK_EQ(data_->buffers[0], nullptr); | |
616 | } | |
617 | ||
618 | void DenseUnionArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
619 | this->UnionArray::SetData(std::move(data)); | |
620 | ||
621 | ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION); | |
622 | ARROW_CHECK_EQ(data_->buffers.size(), 3); | |
623 | ||
624 | // No validity bitmap | |
625 | ARROW_CHECK_EQ(data_->buffers[0], nullptr); | |
626 | ||
627 | raw_value_offsets_ = data->GetValuesSafe<int32_t>(2, /*offset=*/0); | |
628 | } | |
629 | ||
630 | SparseUnionArray::SparseUnionArray(std::shared_ptr<ArrayData> data) { | |
631 | SetData(std::move(data)); | |
632 | } | |
633 | ||
634 | SparseUnionArray::SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, | |
635 | ArrayVector children, | |
636 | std::shared_ptr<Buffer> type_codes, int64_t offset) { | |
637 | auto internal_data = ArrayData::Make(std::move(type), length, | |
638 | BufferVector{nullptr, std::move(type_codes)}, | |
639 | /*null_count=*/0, offset); | |
640 | for (const auto& child : children) { | |
641 | internal_data->child_data.push_back(child->data()); | |
642 | } | |
643 | SetData(std::move(internal_data)); | |
644 | } | |
645 | ||
646 | DenseUnionArray::DenseUnionArray(const std::shared_ptr<ArrayData>& data) { | |
647 | SetData(data); | |
648 | } | |
649 | ||
650 | DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, | |
651 | ArrayVector children, std::shared_ptr<Buffer> type_ids, | |
652 | std::shared_ptr<Buffer> value_offsets, int64_t offset) { | |
653 | auto internal_data = ArrayData::Make( | |
654 | std::move(type), length, | |
655 | BufferVector{nullptr, std::move(type_ids), std::move(value_offsets)}, | |
656 | /*null_count=*/0, offset); | |
657 | for (const auto& child : children) { | |
658 | internal_data->child_data.push_back(child->data()); | |
659 | } | |
660 | SetData(internal_data); | |
661 | } | |
662 | ||
663 | Result<std::shared_ptr<Array>> DenseUnionArray::Make( | |
664 | const Array& type_ids, const Array& value_offsets, ArrayVector children, | |
665 | std::vector<std::string> field_names, std::vector<type_code_t> type_codes) { | |
666 | if (value_offsets.length() == 0) { | |
667 | return Status::Invalid("UnionArray offsets must have non-zero length"); | |
668 | } | |
669 | ||
670 | if (value_offsets.type_id() != Type::INT32) { | |
671 | return Status::TypeError("UnionArray offsets must be signed int32"); | |
672 | } | |
673 | ||
674 | if (type_ids.type_id() != Type::INT8) { | |
675 | return Status::TypeError("UnionArray type_ids must be signed int8"); | |
676 | } | |
677 | ||
678 | if (type_ids.null_count() != 0) { | |
679 | return Status::Invalid("Union type ids may not have nulls"); | |
680 | } | |
681 | ||
682 | if (value_offsets.null_count() != 0) { | |
683 | return Status::Invalid("Make does not allow nulls in value_offsets"); | |
684 | } | |
685 | ||
686 | if (field_names.size() > 0 && field_names.size() != children.size()) { | |
687 | return Status::Invalid("field_names must have the same length as children"); | |
688 | } | |
689 | ||
690 | if (type_codes.size() > 0 && type_codes.size() != children.size()) { | |
691 | return Status::Invalid("type_codes must have the same length as children"); | |
692 | } | |
693 | ||
694 | BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values(), | |
695 | checked_cast<const Int32Array&>(value_offsets).values()}; | |
696 | ||
697 | auto union_type = dense_union(children, std::move(field_names), std::move(type_codes)); | |
698 | auto internal_data = | |
699 | ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers), | |
700 | /*null_count=*/0, type_ids.offset()); | |
701 | for (const auto& child : children) { | |
702 | internal_data->child_data.push_back(child->data()); | |
703 | } | |
704 | return std::make_shared<DenseUnionArray>(std::move(internal_data)); | |
705 | } | |
706 | ||
707 | Result<std::shared_ptr<Array>> SparseUnionArray::Make( | |
708 | const Array& type_ids, ArrayVector children, std::vector<std::string> field_names, | |
709 | std::vector<int8_t> type_codes) { | |
710 | if (type_ids.type_id() != Type::INT8) { | |
711 | return Status::TypeError("UnionArray type_ids must be signed int8"); | |
712 | } | |
713 | ||
714 | if (type_ids.null_count() != 0) { | |
715 | return Status::Invalid("Union type ids may not have nulls"); | |
716 | } | |
717 | ||
718 | if (field_names.size() > 0 && field_names.size() != children.size()) { | |
719 | return Status::Invalid("field_names must have the same length as children"); | |
720 | } | |
721 | ||
722 | if (type_codes.size() > 0 && type_codes.size() != children.size()) { | |
723 | return Status::Invalid("type_codes must have the same length as children"); | |
724 | } | |
725 | ||
726 | BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values()}; | |
727 | auto union_type = sparse_union(children, std::move(field_names), std::move(type_codes)); | |
728 | auto internal_data = | |
729 | ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers), | |
730 | /*null_count=*/0, type_ids.offset()); | |
731 | for (const auto& child : children) { | |
732 | internal_data->child_data.push_back(child->data()); | |
733 | if (child->length() != type_ids.length()) { | |
734 | return Status::Invalid( | |
735 | "Sparse UnionArray must have len(child) == len(type_ids) for all children"); | |
736 | } | |
737 | } | |
738 | return std::make_shared<SparseUnionArray>(std::move(internal_data)); | |
739 | } | |
740 | ||
741 | std::shared_ptr<Array> UnionArray::field(int i) const { | |
742 | if (i < 0 || | |
743 | static_cast<decltype(boxed_fields_)::size_type>(i) >= boxed_fields_.size()) { | |
744 | return nullptr; | |
745 | } | |
746 | std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]); | |
747 | if (!result) { | |
748 | std::shared_ptr<ArrayData> child_data = data_->child_data[i]->Copy(); | |
749 | if (mode() == UnionMode::SPARSE) { | |
750 | // Sparse union: need to adjust child if union is sliced | |
751 | // (for dense unions, the need to lookup through the offsets | |
752 | // makes this unnecessary) | |
753 | if (data_->offset != 0 || child_data->length > data_->length) { | |
754 | child_data = child_data->Slice(data_->offset, data_->length); | |
755 | } | |
756 | } | |
757 | result = MakeArray(child_data); | |
758 | internal::atomic_store(&boxed_fields_[i], result); | |
759 | } | |
760 | return result; | |
761 | } | |
762 | ||
763 | } // namespace arrow |