1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
// Union
29 #include "arrow/array/array_base.h"
30 #include "arrow/array/data.h"
31 #include "arrow/result.h"
32 #include "arrow/status.h"
33 #include "arrow/type.h"
34 #include "arrow/type_fwd.h"
35 #include "arrow/util/checked_cast.h"
36 #include "arrow/util/macros.h"
37 #include "arrow/util/visibility.h"
41 /// \addtogroup nested-arrays
45 // ----------------------------------------------------------------------
48 template <typename TYPE
>
53 // Private helper for ListArray::SetData.
54 // Unfortunately, trying to define BaseListArray::SetData outside of this header
55 // doesn't play well with MSVC.
56 template <typename TYPE
>
57 void SetListData(BaseListArray
<TYPE
>* self
, const std::shared_ptr
<ArrayData
>& data
,
58 Type::type expected_type_id
= TYPE::type_id
);
60 } // namespace internal
62 /// Base class for variable-sized list arrays, regardless of offset size.
63 template <typename TYPE
>
64 class BaseListArray
: public Array
{
66 using TypeClass
= TYPE
;
67 using offset_type
= typename
TypeClass::offset_type
;
69 const TypeClass
* list_type() const { return list_type_
; }
71 /// \brief Return array object containing the list's values
72 std::shared_ptr
<Array
> values() const { return values_
; }
74 /// Note that this buffer does not account for any slice offset
75 std::shared_ptr
<Buffer
> value_offsets() const { return data_
->buffers
[1]; }
77 std::shared_ptr
<DataType
> value_type() const { return list_type_
->value_type(); }
79 /// Return pointer to raw value offsets accounting for any slice offset
80 const offset_type
* raw_value_offsets() const {
81 return raw_value_offsets_
+ data_
->offset
;
84 // The following functions will not perform boundschecking
85 offset_type
value_offset(int64_t i
) const {
86 return raw_value_offsets_
[i
+ data_
->offset
];
88 offset_type
value_length(int64_t i
) const {
90 return raw_value_offsets_
[i
+ 1] - raw_value_offsets_
[i
];
92 std::shared_ptr
<Array
> value_slice(int64_t i
) const {
93 return values_
->Slice(value_offset(i
), value_length(i
));
97 friend void internal::SetListData
<TYPE
>(BaseListArray
<TYPE
>* self
,
98 const std::shared_ptr
<ArrayData
>& data
,
99 Type::type expected_type_id
);
101 const TypeClass
* list_type_
= NULLPTR
;
102 std::shared_ptr
<Array
> values_
;
103 const offset_type
* raw_value_offsets_
= NULLPTR
;
106 /// Concrete Array class for list data
107 class ARROW_EXPORT ListArray
: public BaseListArray
<ListType
> {
109 explicit ListArray(std::shared_ptr
<ArrayData
> data
);
111 ListArray(std::shared_ptr
<DataType
> type
, int64_t length
,
112 std::shared_ptr
<Buffer
> value_offsets
, std::shared_ptr
<Array
> values
,
113 std::shared_ptr
<Buffer
> null_bitmap
= NULLPTR
,
114 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
116 /// \brief Construct ListArray from array of offsets and child value array
118 /// This function does the bare minimum of validation of the offsets and
119 /// input types, and will allocate a new offsets array if necessary (i.e. if
120 /// the offsets contain any nulls). If the offsets do not have nulls, they
121 /// are assumed to be well-formed
123 /// \param[in] offsets Array containing n + 1 offsets encoding length and
124 /// size. Must be of int32 type
125 /// \param[in] values Array containing list values
126 /// \param[in] pool MemoryPool in case new offsets array needs to be
127 /// allocated because of null values
128 static Result
<std::shared_ptr
<ListArray
>> FromArrays(
129 const Array
& offsets
, const Array
& values
,
130 MemoryPool
* pool
= default_memory_pool());
132 /// \brief Return an Array that is a concatenation of the lists in this array.
/// Note that it's different from `values()` in that it takes into
/// consideration this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
137 Result
<std::shared_ptr
<Array
>> Flatten(
138 MemoryPool
* memory_pool
= default_memory_pool()) const;
140 /// \brief Return list offsets as an Int32Array
141 std::shared_ptr
<Array
> offsets() const;
144 // This constructor defers SetData to a derived array class
145 ListArray() = default;
147 void SetData(const std::shared_ptr
<ArrayData
>& data
);
150 /// Concrete Array class for large list data (with 64-bit offsets)
151 class ARROW_EXPORT LargeListArray
: public BaseListArray
<LargeListType
> {
153 explicit LargeListArray(const std::shared_ptr
<ArrayData
>& data
);
155 LargeListArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
156 const std::shared_ptr
<Buffer
>& value_offsets
,
157 const std::shared_ptr
<Array
>& values
,
158 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
159 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
161 /// \brief Construct LargeListArray from array of offsets and child value array
163 /// This function does the bare minimum of validation of the offsets and
164 /// input types, and will allocate a new offsets array if necessary (i.e. if
165 /// the offsets contain any nulls). If the offsets do not have nulls, they
166 /// are assumed to be well-formed
168 /// \param[in] offsets Array containing n + 1 offsets encoding length and
169 /// size. Must be of int64 type
170 /// \param[in] values Array containing list values
171 /// \param[in] pool MemoryPool in case new offsets array needs to be
172 /// allocated because of null values
173 static Result
<std::shared_ptr
<LargeListArray
>> FromArrays(
174 const Array
& offsets
, const Array
& values
,
175 MemoryPool
* pool
= default_memory_pool());
177 /// \brief Return an Array that is a concatenation of the lists in this array.
/// Note that it's different from `values()` in that it takes into
/// consideration this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
182 Result
<std::shared_ptr
<Array
>> Flatten(
183 MemoryPool
* memory_pool
= default_memory_pool()) const;
185 /// \brief Return list offsets as an Int64Array
186 std::shared_ptr
<Array
> offsets() const;
189 void SetData(const std::shared_ptr
<ArrayData
>& data
);
192 // ----------------------------------------------------------------------
195 /// Concrete Array class for map data
197 /// NB: "value" in this context refers to a pair of a key and the corresponding item
198 class ARROW_EXPORT MapArray
: public ListArray
{
200 using TypeClass
= MapType
;
202 explicit MapArray(const std::shared_ptr
<ArrayData
>& data
);
204 MapArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
205 const std::shared_ptr
<Buffer
>& value_offsets
,
206 const std::shared_ptr
<Array
>& keys
, const std::shared_ptr
<Array
>& items
,
207 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
208 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
210 MapArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
211 const std::shared_ptr
<Buffer
>& value_offsets
,
212 const std::shared_ptr
<Array
>& values
,
213 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
214 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
216 /// \brief Construct MapArray from array of offsets and child key, item arrays
218 /// This function does the bare minimum of validation of the offsets and
219 /// input types, and will allocate a new offsets array if necessary (i.e. if
220 /// the offsets contain any nulls). If the offsets do not have nulls, they
221 /// are assumed to be well-formed
223 /// \param[in] offsets Array containing n + 1 offsets encoding length and
224 /// size. Must be of int32 type
225 /// \param[in] keys Array containing key values
226 /// \param[in] items Array containing item values
227 /// \param[in] pool MemoryPool in case new offsets array needs to be
228 /// allocated because of null values
229 static Result
<std::shared_ptr
<Array
>> FromArrays(
230 const std::shared_ptr
<Array
>& offsets
, const std::shared_ptr
<Array
>& keys
,
231 const std::shared_ptr
<Array
>& items
, MemoryPool
* pool
= default_memory_pool());
233 static Result
<std::shared_ptr
<Array
>> FromArrays(
234 std::shared_ptr
<DataType
> type
, const std::shared_ptr
<Array
>& offsets
,
235 const std::shared_ptr
<Array
>& keys
, const std::shared_ptr
<Array
>& items
,
236 MemoryPool
* pool
= default_memory_pool());
238 const MapType
* map_type() const { return map_type_
; }
240 /// \brief Return array object containing all map keys
241 std::shared_ptr
<Array
> keys() const { return keys_
; }
243 /// \brief Return array object containing all mapped items
244 std::shared_ptr
<Array
> items() const { return items_
; }
246 /// Validate child data before constructing the actual MapArray.
247 static Status
ValidateChildData(
248 const std::vector
<std::shared_ptr
<ArrayData
>>& child_data
);
251 void SetData(const std::shared_ptr
<ArrayData
>& data
);
253 static Result
<std::shared_ptr
<Array
>> FromArraysInternal(
254 std::shared_ptr
<DataType
> type
, const std::shared_ptr
<Array
>& offsets
,
255 const std::shared_ptr
<Array
>& keys
, const std::shared_ptr
<Array
>& items
,
259 const MapType
* map_type_
;
260 std::shared_ptr
<Array
> keys_
, items_
;
263 // ----------------------------------------------------------------------
264 // FixedSizeListArray
266 /// Concrete Array class for fixed size list data
267 class ARROW_EXPORT FixedSizeListArray
: public Array
{
269 using TypeClass
= FixedSizeListType
;
270 using offset_type
= TypeClass::offset_type
;
272 explicit FixedSizeListArray(const std::shared_ptr
<ArrayData
>& data
);
274 FixedSizeListArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
275 const std::shared_ptr
<Array
>& values
,
276 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
277 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
279 const FixedSizeListType
* list_type() const;
281 /// \brief Return array object containing the list's values
282 std::shared_ptr
<Array
> values() const;
284 std::shared_ptr
<DataType
> value_type() const;
286 // The following functions will not perform boundschecking
287 int32_t value_offset(int64_t i
) const {
289 return static_cast<int32_t>(list_size_
* i
);
291 int32_t value_length(int64_t i
= 0) const {
295 std::shared_ptr
<Array
> value_slice(int64_t i
) const {
296 return values_
->Slice(value_offset(i
), value_length(i
));
299 /// \brief Return an Array that is a concatenation of the lists in this array.
301 /// Note that it's different from `values()` in that it takes into
302 /// consideration null elements (they are skipped, thus copying may be needed).
303 Result
<std::shared_ptr
<Array
>> Flatten(
304 MemoryPool
* memory_pool
= default_memory_pool()) const;
306 /// \brief Construct FixedSizeListArray from child value array and value_length
308 /// \param[in] values Array containing list values
309 /// \param[in] list_size The fixed length of each list
310 /// \return Will have length equal to values.length() / list_size
311 static Result
<std::shared_ptr
<Array
>> FromArrays(const std::shared_ptr
<Array
>& values
,
315 void SetData(const std::shared_ptr
<ArrayData
>& data
);
319 std::shared_ptr
<Array
> values_
;
322 // ----------------------------------------------------------------------
325 /// Concrete Array class for struct data
326 class ARROW_EXPORT StructArray
: public Array
{
328 using TypeClass
= StructType
;
330 explicit StructArray(const std::shared_ptr
<ArrayData
>& data
);
332 StructArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
333 const std::vector
<std::shared_ptr
<Array
>>& children
,
334 std::shared_ptr
<Buffer
> null_bitmap
= NULLPTR
,
335 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
337 /// \brief Return a StructArray from child arrays and field names.
339 /// The length and data type are automatically inferred from the arguments.
340 /// There should be at least one child array.
341 static Result
<std::shared_ptr
<StructArray
>> Make(
342 const ArrayVector
& children
, const std::vector
<std::string
>& field_names
,
343 std::shared_ptr
<Buffer
> null_bitmap
= NULLPTR
,
344 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
346 /// \brief Return a StructArray from child arrays and fields.
348 /// The length is automatically inferred from the arguments.
349 /// There should be at least one child array. This method does not
350 /// check that field types and child array types are consistent.
351 static Result
<std::shared_ptr
<StructArray
>> Make(
352 const ArrayVector
& children
, const FieldVector
& fields
,
353 std::shared_ptr
<Buffer
> null_bitmap
= NULLPTR
,
354 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
356 const StructType
* struct_type() const;
// Return a shared pointer in case the requestor desires to share ownership
// with this array. The returned array has its offset, length and null
// count adjusted.
361 std::shared_ptr
<Array
> field(int pos
) const;
363 const ArrayVector
& fields() const;
365 /// Returns null if name not found
366 std::shared_ptr
<Array
> GetFieldByName(const std::string
& name
) const;
368 /// \brief Flatten this array as a vector of arrays, one for each field
370 /// \param[in] pool The pool to allocate null bitmaps from, if necessary
371 Result
<ArrayVector
> Flatten(MemoryPool
* pool
= default_memory_pool()) const;
374 // For caching boxed child data
375 // XXX This is not handled in a thread-safe manner.
376 mutable ArrayVector boxed_fields_
;
379 // ----------------------------------------------------------------------
382 /// Base class for SparseUnionArray and DenseUnionArray
383 class ARROW_EXPORT UnionArray
: public Array
{
385 using type_code_t
= int8_t;
387 /// Note that this buffer does not account for any slice offset
388 std::shared_ptr
<Buffer
> type_codes() const { return data_
->buffers
[1]; }
390 const type_code_t
* raw_type_codes() const { return raw_type_codes_
+ data_
->offset
; }
392 /// The logical type code of the value at index.
393 type_code_t
type_code(int64_t i
) const { return raw_type_codes_
[i
+ data_
->offset
]; }
395 /// The physical child id containing value at index.
396 int child_id(int64_t i
) const {
397 return union_type_
->child_ids()[raw_type_codes_
[i
+ data_
->offset
]];
400 const UnionType
* union_type() const { return union_type_
; }
402 UnionMode::type
mode() const { return union_type_
->mode(); }
404 /// \brief Return the given field as an individual array.
/// For sparse unions, the returned array has its offset, length and null
/// count adjusted.
408 std::shared_ptr
<Array
> field(int pos
) const;
411 void SetData(std::shared_ptr
<ArrayData
> data
);
413 const type_code_t
* raw_type_codes_
;
414 const UnionType
* union_type_
;
416 // For caching boxed child data
417 mutable std::vector
<std::shared_ptr
<Array
>> boxed_fields_
;
420 /// Concrete Array class for sparse union data
421 class ARROW_EXPORT SparseUnionArray
: public UnionArray
{
423 using TypeClass
= SparseUnionType
;
425 explicit SparseUnionArray(std::shared_ptr
<ArrayData
> data
);
427 SparseUnionArray(std::shared_ptr
<DataType
> type
, int64_t length
, ArrayVector children
,
428 std::shared_ptr
<Buffer
> type_ids
, int64_t offset
= 0);
430 /// \brief Construct SparseUnionArray from type_ids and children
432 /// This function does the bare minimum of validation of the input types.
434 /// \param[in] type_ids An array of logical type ids for the union type
435 /// \param[in] children Vector of children Arrays containing the data for each type.
436 /// \param[in] type_codes Vector of type codes.
437 static Result
<std::shared_ptr
<Array
>> Make(const Array
& type_ids
, ArrayVector children
,
438 std::vector
<type_code_t
> type_codes
) {
439 return Make(std::move(type_ids
), std::move(children
), std::vector
<std::string
>{},
440 std::move(type_codes
));
443 /// \brief Construct SparseUnionArray with custom field names from type_ids and children
445 /// This function does the bare minimum of validation of the input types.
447 /// \param[in] type_ids An array of logical type ids for the union type
448 /// \param[in] children Vector of children Arrays containing the data for each type.
449 /// \param[in] field_names Vector of strings containing the name of each field.
450 /// \param[in] type_codes Vector of type codes.
451 static Result
<std::shared_ptr
<Array
>> Make(const Array
& type_ids
, ArrayVector children
,
452 std::vector
<std::string
> field_names
= {},
453 std::vector
<type_code_t
> type_codes
= {});
455 const SparseUnionType
* union_type() const {
456 return internal::checked_cast
<const SparseUnionType
*>(union_type_
);
460 void SetData(std::shared_ptr
<ArrayData
> data
);
463 /// \brief Concrete Array class for dense union data
465 /// Note that union types do not have a validity bitmap
466 class ARROW_EXPORT DenseUnionArray
: public UnionArray
{
468 using TypeClass
= DenseUnionType
;
470 explicit DenseUnionArray(const std::shared_ptr
<ArrayData
>& data
);
472 DenseUnionArray(std::shared_ptr
<DataType
> type
, int64_t length
, ArrayVector children
,
473 std::shared_ptr
<Buffer
> type_ids
,
474 std::shared_ptr
<Buffer
> value_offsets
= NULLPTR
, int64_t offset
= 0);
476 /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
478 /// This function does the bare minimum of validation of the offsets and
481 /// \param[in] type_ids An array of logical type ids for the union type
482 /// \param[in] value_offsets An array of signed int32 values indicating the
483 /// relative offset into the respective child array for the type in a given slot.
484 /// The respective offsets for each child value array must be in order / increasing.
485 /// \param[in] children Vector of children Arrays containing the data for each type.
486 /// \param[in] type_codes Vector of type codes.
487 static Result
<std::shared_ptr
<Array
>> Make(const Array
& type_ids
,
488 const Array
& value_offsets
,
489 ArrayVector children
,
490 std::vector
<type_code_t
> type_codes
) {
491 return Make(type_ids
, value_offsets
, std::move(children
), std::vector
<std::string
>{},
492 std::move(type_codes
));
495 /// \brief Construct DenseUnionArray with custom field names from type_ids,
496 /// value_offsets, and children
/// This function does the bare minimum of validation of the offsets and
/// input types.
501 /// \param[in] type_ids An array of logical type ids for the union type
502 /// \param[in] value_offsets An array of signed int32 values indicating the
503 /// relative offset into the respective child array for the type in a given slot.
504 /// The respective offsets for each child value array must be in order / increasing.
505 /// \param[in] children Vector of children Arrays containing the data for each type.
506 /// \param[in] field_names Vector of strings containing the name of each field.
507 /// \param[in] type_codes Vector of type codes.
508 static Result
<std::shared_ptr
<Array
>> Make(const Array
& type_ids
,
509 const Array
& value_offsets
,
510 ArrayVector children
,
511 std::vector
<std::string
> field_names
= {},
512 std::vector
<type_code_t
> type_codes
= {});
514 const DenseUnionType
* union_type() const {
515 return internal::checked_cast
<const DenseUnionType
*>(union_type_
);
518 /// Note that this buffer does not account for any slice offset
519 std::shared_ptr
<Buffer
> value_offsets() const { return data_
->buffers
[2]; }
521 int32_t value_offset(int64_t i
) const { return raw_value_offsets_
[i
+ data_
->offset
]; }
523 const int32_t* raw_value_offsets() const { return raw_value_offsets_
+ data_
->offset
; }
526 const int32_t* raw_value_offsets_
;
528 void SetData(const std::shared_ptr
<ArrayData
>& data
);