]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/array/array_nested.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / array / array_nested.h
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
19 // Union
20
21 #pragma once
22
23 #include <cstdint>
24 #include <memory>
25 #include <string>
26 #include <utility>
27 #include <vector>
28
29 #include "arrow/array/array_base.h"
30 #include "arrow/array/data.h"
31 #include "arrow/result.h"
32 #include "arrow/status.h"
33 #include "arrow/type.h"
34 #include "arrow/type_fwd.h"
35 #include "arrow/util/checked_cast.h"
36 #include "arrow/util/macros.h"
37 #include "arrow/util/visibility.h"
38
39 namespace arrow {
40
41 /// \addtogroup nested-arrays
42 ///
43 /// @{
44
45 // ----------------------------------------------------------------------
46 // ListArray
47
48 template <typename TYPE>
49 class BaseListArray;
50
51 namespace internal {
52
53 // Private helper for ListArray::SetData.
54 // Unfortunately, trying to define BaseListArray::SetData outside of this header
55 // doesn't play well with MSVC.
56 template <typename TYPE>
57 void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
58 Type::type expected_type_id = TYPE::type_id);
59
60 } // namespace internal
61
62 /// Base class for variable-sized list arrays, regardless of offset size.
63 template <typename TYPE>
64 class BaseListArray : public Array {
65 public:
66 using TypeClass = TYPE;
67 using offset_type = typename TypeClass::offset_type;
68
69 const TypeClass* list_type() const { return list_type_; }
70
71 /// \brief Return array object containing the list's values
72 std::shared_ptr<Array> values() const { return values_; }
73
74 /// Note that this buffer does not account for any slice offset
75 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
76
77 std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
78
79 /// Return pointer to raw value offsets accounting for any slice offset
80 const offset_type* raw_value_offsets() const {
81 return raw_value_offsets_ + data_->offset;
82 }
83
84 // The following functions will not perform boundschecking
85 offset_type value_offset(int64_t i) const {
86 return raw_value_offsets_[i + data_->offset];
87 }
88 offset_type value_length(int64_t i) const {
89 i += data_->offset;
90 return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
91 }
92 std::shared_ptr<Array> value_slice(int64_t i) const {
93 return values_->Slice(value_offset(i), value_length(i));
94 }
95
96 protected:
97 friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
98 const std::shared_ptr<ArrayData>& data,
99 Type::type expected_type_id);
100
101 const TypeClass* list_type_ = NULLPTR;
102 std::shared_ptr<Array> values_;
103 const offset_type* raw_value_offsets_ = NULLPTR;
104 };
105
106 /// Concrete Array class for list data
107 class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
108 public:
109 explicit ListArray(std::shared_ptr<ArrayData> data);
110
111 ListArray(std::shared_ptr<DataType> type, int64_t length,
112 std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
113 std::shared_ptr<Buffer> null_bitmap = NULLPTR,
114 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
115
116 /// \brief Construct ListArray from array of offsets and child value array
117 ///
118 /// This function does the bare minimum of validation of the offsets and
119 /// input types, and will allocate a new offsets array if necessary (i.e. if
120 /// the offsets contain any nulls). If the offsets do not have nulls, they
121 /// are assumed to be well-formed
122 ///
123 /// \param[in] offsets Array containing n + 1 offsets encoding length and
124 /// size. Must be of int32 type
125 /// \param[in] values Array containing list values
126 /// \param[in] pool MemoryPool in case new offsets array needs to be
127 /// allocated because of null values
128 static Result<std::shared_ptr<ListArray>> FromArrays(
129 const Array& offsets, const Array& values,
130 MemoryPool* pool = default_memory_pool());
131
132 /// \brief Return an Array that is a concatenation of the lists in this array.
133 ///
134 /// Note that it's different from `values()` in that it takes into
135 /// consideration of this array's offsets as well as null elements backed
136 /// by non-empty lists (they are skipped, thus copying may be needed).
137 Result<std::shared_ptr<Array>> Flatten(
138 MemoryPool* memory_pool = default_memory_pool()) const;
139
140 /// \brief Return list offsets as an Int32Array
141 std::shared_ptr<Array> offsets() const;
142
143 protected:
144 // This constructor defers SetData to a derived array class
145 ListArray() = default;
146
147 void SetData(const std::shared_ptr<ArrayData>& data);
148 };
149
150 /// Concrete Array class for large list data (with 64-bit offsets)
151 class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
152 public:
153 explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
154
155 LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
156 const std::shared_ptr<Buffer>& value_offsets,
157 const std::shared_ptr<Array>& values,
158 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
159 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
160
161 /// \brief Construct LargeListArray from array of offsets and child value array
162 ///
163 /// This function does the bare minimum of validation of the offsets and
164 /// input types, and will allocate a new offsets array if necessary (i.e. if
165 /// the offsets contain any nulls). If the offsets do not have nulls, they
166 /// are assumed to be well-formed
167 ///
168 /// \param[in] offsets Array containing n + 1 offsets encoding length and
169 /// size. Must be of int64 type
170 /// \param[in] values Array containing list values
171 /// \param[in] pool MemoryPool in case new offsets array needs to be
172 /// allocated because of null values
173 static Result<std::shared_ptr<LargeListArray>> FromArrays(
174 const Array& offsets, const Array& values,
175 MemoryPool* pool = default_memory_pool());
176
177 /// \brief Return an Array that is a concatenation of the lists in this array.
178 ///
179 /// Note that it's different from `values()` in that it takes into
180 /// consideration of this array's offsets as well as null elements backed
181 /// by non-empty lists (they are skipped, thus copying may be needed).
182 Result<std::shared_ptr<Array>> Flatten(
183 MemoryPool* memory_pool = default_memory_pool()) const;
184
185 /// \brief Return list offsets as an Int64Array
186 std::shared_ptr<Array> offsets() const;
187
188 protected:
189 void SetData(const std::shared_ptr<ArrayData>& data);
190 };
191
192 // ----------------------------------------------------------------------
193 // MapArray
194
195 /// Concrete Array class for map data
196 ///
197 /// NB: "value" in this context refers to a pair of a key and the corresponding item
198 class ARROW_EXPORT MapArray : public ListArray {
199 public:
200 using TypeClass = MapType;
201
202 explicit MapArray(const std::shared_ptr<ArrayData>& data);
203
204 MapArray(const std::shared_ptr<DataType>& type, int64_t length,
205 const std::shared_ptr<Buffer>& value_offsets,
206 const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
207 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
208 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
209
210 MapArray(const std::shared_ptr<DataType>& type, int64_t length,
211 const std::shared_ptr<Buffer>& value_offsets,
212 const std::shared_ptr<Array>& values,
213 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
214 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
215
216 /// \brief Construct MapArray from array of offsets and child key, item arrays
217 ///
218 /// This function does the bare minimum of validation of the offsets and
219 /// input types, and will allocate a new offsets array if necessary (i.e. if
220 /// the offsets contain any nulls). If the offsets do not have nulls, they
221 /// are assumed to be well-formed
222 ///
223 /// \param[in] offsets Array containing n + 1 offsets encoding length and
224 /// size. Must be of int32 type
225 /// \param[in] keys Array containing key values
226 /// \param[in] items Array containing item values
227 /// \param[in] pool MemoryPool in case new offsets array needs to be
228 /// allocated because of null values
229 static Result<std::shared_ptr<Array>> FromArrays(
230 const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
231 const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());
232
233 static Result<std::shared_ptr<Array>> FromArrays(
234 std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
235 const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
236 MemoryPool* pool = default_memory_pool());
237
238 const MapType* map_type() const { return map_type_; }
239
240 /// \brief Return array object containing all map keys
241 std::shared_ptr<Array> keys() const { return keys_; }
242
243 /// \brief Return array object containing all mapped items
244 std::shared_ptr<Array> items() const { return items_; }
245
246 /// Validate child data before constructing the actual MapArray.
247 static Status ValidateChildData(
248 const std::vector<std::shared_ptr<ArrayData>>& child_data);
249
250 protected:
251 void SetData(const std::shared_ptr<ArrayData>& data);
252
253 static Result<std::shared_ptr<Array>> FromArraysInternal(
254 std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
255 const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
256 MemoryPool* pool);
257
258 private:
259 const MapType* map_type_;
260 std::shared_ptr<Array> keys_, items_;
261 };
262
263 // ----------------------------------------------------------------------
264 // FixedSizeListArray
265
266 /// Concrete Array class for fixed size list data
267 class ARROW_EXPORT FixedSizeListArray : public Array {
268 public:
269 using TypeClass = FixedSizeListType;
270 using offset_type = TypeClass::offset_type;
271
272 explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
273
274 FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
275 const std::shared_ptr<Array>& values,
276 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
277 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
278
279 const FixedSizeListType* list_type() const;
280
281 /// \brief Return array object containing the list's values
282 std::shared_ptr<Array> values() const;
283
284 std::shared_ptr<DataType> value_type() const;
285
286 // The following functions will not perform boundschecking
287 int32_t value_offset(int64_t i) const {
288 i += data_->offset;
289 return static_cast<int32_t>(list_size_ * i);
290 }
291 int32_t value_length(int64_t i = 0) const {
292 ARROW_UNUSED(i);
293 return list_size_;
294 }
295 std::shared_ptr<Array> value_slice(int64_t i) const {
296 return values_->Slice(value_offset(i), value_length(i));
297 }
298
299 /// \brief Return an Array that is a concatenation of the lists in this array.
300 ///
301 /// Note that it's different from `values()` in that it takes into
302 /// consideration null elements (they are skipped, thus copying may be needed).
303 Result<std::shared_ptr<Array>> Flatten(
304 MemoryPool* memory_pool = default_memory_pool()) const;
305
306 /// \brief Construct FixedSizeListArray from child value array and value_length
307 ///
308 /// \param[in] values Array containing list values
309 /// \param[in] list_size The fixed length of each list
310 /// \return Will have length equal to values.length() / list_size
311 static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
312 int32_t list_size);
313
314 protected:
315 void SetData(const std::shared_ptr<ArrayData>& data);
316 int32_t list_size_;
317
318 private:
319 std::shared_ptr<Array> values_;
320 };
321
322 // ----------------------------------------------------------------------
323 // Struct
324
325 /// Concrete Array class for struct data
326 class ARROW_EXPORT StructArray : public Array {
327 public:
328 using TypeClass = StructType;
329
330 explicit StructArray(const std::shared_ptr<ArrayData>& data);
331
332 StructArray(const std::shared_ptr<DataType>& type, int64_t length,
333 const std::vector<std::shared_ptr<Array>>& children,
334 std::shared_ptr<Buffer> null_bitmap = NULLPTR,
335 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
336
337 /// \brief Return a StructArray from child arrays and field names.
338 ///
339 /// The length and data type are automatically inferred from the arguments.
340 /// There should be at least one child array.
341 static Result<std::shared_ptr<StructArray>> Make(
342 const ArrayVector& children, const std::vector<std::string>& field_names,
343 std::shared_ptr<Buffer> null_bitmap = NULLPTR,
344 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
345
346 /// \brief Return a StructArray from child arrays and fields.
347 ///
348 /// The length is automatically inferred from the arguments.
349 /// There should be at least one child array. This method does not
350 /// check that field types and child array types are consistent.
351 static Result<std::shared_ptr<StructArray>> Make(
352 const ArrayVector& children, const FieldVector& fields,
353 std::shared_ptr<Buffer> null_bitmap = NULLPTR,
354 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
355
356 const StructType* struct_type() const;
357
358 // Return a shared pointer in case the requestor desires to share ownership
359 // with this array. The returned array has its offset, length and null
360 // count adjusted.
361 std::shared_ptr<Array> field(int pos) const;
362
363 const ArrayVector& fields() const;
364
365 /// Returns null if name not found
366 std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
367
368 /// \brief Flatten this array as a vector of arrays, one for each field
369 ///
370 /// \param[in] pool The pool to allocate null bitmaps from, if necessary
371 Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
372
373 private:
374 // For caching boxed child data
375 // XXX This is not handled in a thread-safe manner.
376 mutable ArrayVector boxed_fields_;
377 };
378
379 // ----------------------------------------------------------------------
380 // Union
381
382 /// Base class for SparseUnionArray and DenseUnionArray
383 class ARROW_EXPORT UnionArray : public Array {
384 public:
385 using type_code_t = int8_t;
386
387 /// Note that this buffer does not account for any slice offset
388 std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
389
390 const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
391
392 /// The logical type code of the value at index.
393 type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; }
394
395 /// The physical child id containing value at index.
396 int child_id(int64_t i) const {
397 return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
398 }
399
400 const UnionType* union_type() const { return union_type_; }
401
402 UnionMode::type mode() const { return union_type_->mode(); }
403
404 /// \brief Return the given field as an individual array.
405 ///
406 /// For sparse unions, the returned array has its offset, length and null
407 /// count adjusted.
408 std::shared_ptr<Array> field(int pos) const;
409
410 protected:
411 void SetData(std::shared_ptr<ArrayData> data);
412
413 const type_code_t* raw_type_codes_;
414 const UnionType* union_type_;
415
416 // For caching boxed child data
417 mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
418 };
419
420 /// Concrete Array class for sparse union data
421 class ARROW_EXPORT SparseUnionArray : public UnionArray {
422 public:
423 using TypeClass = SparseUnionType;
424
425 explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
426
427 SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
428 std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
429
430 /// \brief Construct SparseUnionArray from type_ids and children
431 ///
432 /// This function does the bare minimum of validation of the input types.
433 ///
434 /// \param[in] type_ids An array of logical type ids for the union type
435 /// \param[in] children Vector of children Arrays containing the data for each type.
436 /// \param[in] type_codes Vector of type codes.
437 static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
438 std::vector<type_code_t> type_codes) {
439 return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
440 std::move(type_codes));
441 }
442
443 /// \brief Construct SparseUnionArray with custom field names from type_ids and children
444 ///
445 /// This function does the bare minimum of validation of the input types.
446 ///
447 /// \param[in] type_ids An array of logical type ids for the union type
448 /// \param[in] children Vector of children Arrays containing the data for each type.
449 /// \param[in] field_names Vector of strings containing the name of each field.
450 /// \param[in] type_codes Vector of type codes.
451 static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
452 std::vector<std::string> field_names = {},
453 std::vector<type_code_t> type_codes = {});
454
455 const SparseUnionType* union_type() const {
456 return internal::checked_cast<const SparseUnionType*>(union_type_);
457 }
458
459 protected:
460 void SetData(std::shared_ptr<ArrayData> data);
461 };
462
463 /// \brief Concrete Array class for dense union data
464 ///
465 /// Note that union types do not have a validity bitmap
466 class ARROW_EXPORT DenseUnionArray : public UnionArray {
467 public:
468 using TypeClass = DenseUnionType;
469
470 explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
471
472 DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
473 std::shared_ptr<Buffer> type_ids,
474 std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
475
476 /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
477 ///
478 /// This function does the bare minimum of validation of the offsets and
479 /// input types.
480 ///
481 /// \param[in] type_ids An array of logical type ids for the union type
482 /// \param[in] value_offsets An array of signed int32 values indicating the
483 /// relative offset into the respective child array for the type in a given slot.
484 /// The respective offsets for each child value array must be in order / increasing.
485 /// \param[in] children Vector of children Arrays containing the data for each type.
486 /// \param[in] type_codes Vector of type codes.
487 static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
488 const Array& value_offsets,
489 ArrayVector children,
490 std::vector<type_code_t> type_codes) {
491 return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
492 std::move(type_codes));
493 }
494
495 /// \brief Construct DenseUnionArray with custom field names from type_ids,
496 /// value_offsets, and children
497 ///
498 /// This function does the bare minimum of validation of the offsets and
499 /// input types.
500 ///
501 /// \param[in] type_ids An array of logical type ids for the union type
502 /// \param[in] value_offsets An array of signed int32 values indicating the
503 /// relative offset into the respective child array for the type in a given slot.
504 /// The respective offsets for each child value array must be in order / increasing.
505 /// \param[in] children Vector of children Arrays containing the data for each type.
506 /// \param[in] field_names Vector of strings containing the name of each field.
507 /// \param[in] type_codes Vector of type codes.
508 static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
509 const Array& value_offsets,
510 ArrayVector children,
511 std::vector<std::string> field_names = {},
512 std::vector<type_code_t> type_codes = {});
513
514 const DenseUnionType* union_type() const {
515 return internal::checked_cast<const DenseUnionType*>(union_type_);
516 }
517
518 /// Note that this buffer does not account for any slice offset
519 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
520
521 int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
522
523 const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
524
525 protected:
526 const int32_t* raw_value_offsets_;
527
528 void SetData(const std::shared_ptr<ArrayData>& data);
529 };
530
531 /// @}
532
533 } // namespace arrow