1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
// Array accessor classes for Binary, LargeBinary, String, LargeString,
// and FixedSizeBinary
28 #include "arrow/array/array_base.h"
29 #include "arrow/array/data.h"
30 #include "arrow/buffer.h"
31 #include "arrow/stl_iterator.h"
32 #include "arrow/type.h"
33 #include "arrow/util/checked_cast.h"
34 #include "arrow/util/macros.h"
35 #include "arrow/util/string_view.h" // IWYU pragma: export
36 #include "arrow/util/visibility.h"
40 /// \addtogroup binary-arrays
44 // ----------------------------------------------------------------------
47 /// Base class for variable-sized binary arrays, regardless of offset size
48 /// and logical interpretation.
49 template <typename TYPE
>
50 class BaseBinaryArray
: public FlatArray
{
52 using TypeClass
= TYPE
;
53 using offset_type
= typename
TypeClass::offset_type
;
54 using IteratorType
= stl::ArrayIterator
<BaseBinaryArray
<TYPE
>>;
56 /// Return the pointer to the given elements bytes
57 // XXX should GetValue(int64_t i) return a string_view?
58 const uint8_t* GetValue(int64_t i
, offset_type
* out_length
) const {
59 // Account for base offset
61 const offset_type pos
= raw_value_offsets_
[i
];
62 *out_length
= raw_value_offsets_
[i
+ 1] - pos
;
63 return raw_data_
+ pos
;
66 /// \brief Get binary value as a string_view
68 /// \param i the value index
69 /// \return the view over the selected value
70 util::string_view
GetView(int64_t i
) const {
71 // Account for base offset
73 const offset_type pos
= raw_value_offsets_
[i
];
74 return util::string_view(reinterpret_cast<const char*>(raw_data_
+ pos
),
75 raw_value_offsets_
[i
+ 1] - pos
);
78 /// \brief Get binary value as a string_view
79 /// Provided for consistency with other arrays.
81 /// \param i the value index
82 /// \return the view over the selected value
83 util::string_view
Value(int64_t i
) const { return GetView(i
); }
85 /// \brief Get binary value as a std::string
87 /// \param i the value index
88 /// \return the value copied into a std::string
89 std::string
GetString(int64_t i
) const { return std::string(GetView(i
)); }
91 /// Note that this buffer does not account for any slice offset
92 std::shared_ptr
<Buffer
> value_offsets() const { return data_
->buffers
[1]; }
94 /// Note that this buffer does not account for any slice offset
95 std::shared_ptr
<Buffer
> value_data() const { return data_
->buffers
[2]; }
97 const offset_type
* raw_value_offsets() const {
98 return raw_value_offsets_
+ data_
->offset
;
101 const uint8_t* raw_data() const { return raw_data_
; }
103 /// \brief Return the data buffer absolute offset of the data for the value
104 /// at the passed index.
106 /// Does not perform boundschecking
107 offset_type
value_offset(int64_t i
) const {
108 return raw_value_offsets_
[i
+ data_
->offset
];
111 /// \brief Return the length of the data for the value at the passed index.
113 /// Does not perform boundschecking
114 offset_type
value_length(int64_t i
) const {
116 return raw_value_offsets_
[i
+ 1] - raw_value_offsets_
[i
];
119 /// \brief Return the total length of the memory in the data buffer
120 /// referenced by this array. If the array has been sliced then this may be
121 /// less than the size of the data buffer (data_->buffers[2]).
122 offset_type
total_values_length() const {
123 if (data_
->length
> 0) {
124 return raw_value_offsets_
[data_
->length
+ data_
->offset
] -
125 raw_value_offsets_
[data_
->offset
];
131 IteratorType
begin() const { return IteratorType(*this); }
133 IteratorType
end() const { return IteratorType(*this, length()); }
137 BaseBinaryArray() = default;
139 // Protected method for constructors
140 void SetData(const std::shared_ptr
<ArrayData
>& data
) {
141 this->Array::SetData(data
);
142 raw_value_offsets_
= data
->GetValuesSafe
<offset_type
>(1, /*offset=*/0);
143 raw_data_
= data
->GetValuesSafe
<uint8_t>(2, /*offset=*/0);
146 const offset_type
* raw_value_offsets_
= NULLPTR
;
147 const uint8_t* raw_data_
= NULLPTR
;
150 /// Concrete Array class for variable-size binary data
151 class ARROW_EXPORT BinaryArray
: public BaseBinaryArray
<BinaryType
> {
153 explicit BinaryArray(const std::shared_ptr
<ArrayData
>& data
);
155 BinaryArray(int64_t length
, const std::shared_ptr
<Buffer
>& value_offsets
,
156 const std::shared_ptr
<Buffer
>& data
,
157 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
158 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
161 // For subclasses such as StringArray
162 BinaryArray() : BaseBinaryArray() {}
165 /// Concrete Array class for variable-size string (utf-8) data
166 class ARROW_EXPORT StringArray
: public BinaryArray
{
168 using TypeClass
= StringType
;
170 explicit StringArray(const std::shared_ptr
<ArrayData
>& data
);
172 StringArray(int64_t length
, const std::shared_ptr
<Buffer
>& value_offsets
,
173 const std::shared_ptr
<Buffer
>& data
,
174 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
175 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
177 /// \brief Validate that this array contains only valid UTF8 entries
179 /// This check is also implied by ValidateFull()
180 Status
ValidateUTF8() const;
183 /// Concrete Array class for large variable-size binary data
184 class ARROW_EXPORT LargeBinaryArray
: public BaseBinaryArray
<LargeBinaryType
> {
186 explicit LargeBinaryArray(const std::shared_ptr
<ArrayData
>& data
);
188 LargeBinaryArray(int64_t length
, const std::shared_ptr
<Buffer
>& value_offsets
,
189 const std::shared_ptr
<Buffer
>& data
,
190 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
191 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
194 // For subclasses such as LargeStringArray
195 LargeBinaryArray() : BaseBinaryArray() {}
198 /// Concrete Array class for large variable-size string (utf-8) data
199 class ARROW_EXPORT LargeStringArray
: public LargeBinaryArray
{
201 using TypeClass
= LargeStringType
;
203 explicit LargeStringArray(const std::shared_ptr
<ArrayData
>& data
);
205 LargeStringArray(int64_t length
, const std::shared_ptr
<Buffer
>& value_offsets
,
206 const std::shared_ptr
<Buffer
>& data
,
207 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
208 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
210 /// \brief Validate that this array contains only valid UTF8 entries
212 /// This check is also implied by ValidateFull()
213 Status
ValidateUTF8() const;
216 // ----------------------------------------------------------------------
217 // Fixed width binary
219 /// Concrete Array class for fixed-size binary data
220 class ARROW_EXPORT FixedSizeBinaryArray
: public PrimitiveArray
{
222 using TypeClass
= FixedSizeBinaryType
;
223 using IteratorType
= stl::ArrayIterator
<FixedSizeBinaryArray
>;
225 explicit FixedSizeBinaryArray(const std::shared_ptr
<ArrayData
>& data
);
227 FixedSizeBinaryArray(const std::shared_ptr
<DataType
>& type
, int64_t length
,
228 const std::shared_ptr
<Buffer
>& data
,
229 const std::shared_ptr
<Buffer
>& null_bitmap
= NULLPTR
,
230 int64_t null_count
= kUnknownNullCount
, int64_t offset
= 0);
232 const uint8_t* GetValue(int64_t i
) const;
233 const uint8_t* Value(int64_t i
) const { return GetValue(i
); }
235 util::string_view
GetView(int64_t i
) const {
236 return util::string_view(reinterpret_cast<const char*>(GetValue(i
)), byte_width());
239 std::string
GetString(int64_t i
) const { return std::string(GetView(i
)); }
241 int32_t byte_width() const { return byte_width_
; }
243 const uint8_t* raw_values() const { return raw_values_
+ data_
->offset
* byte_width_
; }
245 IteratorType
begin() const { return IteratorType(*this); }
247 IteratorType
end() const { return IteratorType(*this, length()); }
250 void SetData(const std::shared_ptr
<ArrayData
>& data
) {
251 this->PrimitiveArray::SetData(data
);
253 internal::checked_cast
<const FixedSizeBinaryType
&>(*type()).byte_width();