]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/array/array_binary.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / array / array_binary.h
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // Array accessor classes for Binary, LargeBinary, String, LargeString,
19 // FixedSizeBinary
20
21 #pragma once
22
23 #include <cstdint>
24 #include <memory>
25 #include <string>
26 #include <vector>
27
28 #include "arrow/array/array_base.h"
29 #include "arrow/array/data.h"
30 #include "arrow/buffer.h"
31 #include "arrow/stl_iterator.h"
32 #include "arrow/type.h"
33 #include "arrow/util/checked_cast.h"
34 #include "arrow/util/macros.h"
35 #include "arrow/util/string_view.h" // IWYU pragma: export
36 #include "arrow/util/visibility.h"
37
38 namespace arrow {
39
40 /// \addtogroup binary-arrays
41 ///
42 /// @{
43
44 // ----------------------------------------------------------------------
45 // Binary and String
46
47 /// Base class for variable-sized binary arrays, regardless of offset size
48 /// and logical interpretation.
49 template <typename TYPE>
50 class BaseBinaryArray : public FlatArray {
51 public:
52 using TypeClass = TYPE;
53 using offset_type = typename TypeClass::offset_type;
54 using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
55
56 /// Return the pointer to the given elements bytes
57 // XXX should GetValue(int64_t i) return a string_view?
58 const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
59 // Account for base offset
60 i += data_->offset;
61 const offset_type pos = raw_value_offsets_[i];
62 *out_length = raw_value_offsets_[i + 1] - pos;
63 return raw_data_ + pos;
64 }
65
66 /// \brief Get binary value as a string_view
67 ///
68 /// \param i the value index
69 /// \return the view over the selected value
70 util::string_view GetView(int64_t i) const {
71 // Account for base offset
72 i += data_->offset;
73 const offset_type pos = raw_value_offsets_[i];
74 return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
75 raw_value_offsets_[i + 1] - pos);
76 }
77
78 /// \brief Get binary value as a string_view
79 /// Provided for consistency with other arrays.
80 ///
81 /// \param i the value index
82 /// \return the view over the selected value
83 util::string_view Value(int64_t i) const { return GetView(i); }
84
85 /// \brief Get binary value as a std::string
86 ///
87 /// \param i the value index
88 /// \return the value copied into a std::string
89 std::string GetString(int64_t i) const { return std::string(GetView(i)); }
90
91 /// Note that this buffer does not account for any slice offset
92 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
93
94 /// Note that this buffer does not account for any slice offset
95 std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
96
97 const offset_type* raw_value_offsets() const {
98 return raw_value_offsets_ + data_->offset;
99 }
100
101 const uint8_t* raw_data() const { return raw_data_; }
102
103 /// \brief Return the data buffer absolute offset of the data for the value
104 /// at the passed index.
105 ///
106 /// Does not perform boundschecking
107 offset_type value_offset(int64_t i) const {
108 return raw_value_offsets_[i + data_->offset];
109 }
110
111 /// \brief Return the length of the data for the value at the passed index.
112 ///
113 /// Does not perform boundschecking
114 offset_type value_length(int64_t i) const {
115 i += data_->offset;
116 return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
117 }
118
119 /// \brief Return the total length of the memory in the data buffer
120 /// referenced by this array. If the array has been sliced then this may be
121 /// less than the size of the data buffer (data_->buffers[2]).
122 offset_type total_values_length() const {
123 if (data_->length > 0) {
124 return raw_value_offsets_[data_->length + data_->offset] -
125 raw_value_offsets_[data_->offset];
126 } else {
127 return 0;
128 }
129 }
130
131 IteratorType begin() const { return IteratorType(*this); }
132
133 IteratorType end() const { return IteratorType(*this, length()); }
134
135 protected:
136 // For subclasses
137 BaseBinaryArray() = default;
138
139 // Protected method for constructors
140 void SetData(const std::shared_ptr<ArrayData>& data) {
141 this->Array::SetData(data);
142 raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
143 raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
144 }
145
146 const offset_type* raw_value_offsets_ = NULLPTR;
147 const uint8_t* raw_data_ = NULLPTR;
148 };
149
150 /// Concrete Array class for variable-size binary data
151 class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
152 public:
153 explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
154
155 BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
156 const std::shared_ptr<Buffer>& data,
157 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
158 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
159
160 protected:
161 // For subclasses such as StringArray
162 BinaryArray() : BaseBinaryArray() {}
163 };
164
165 /// Concrete Array class for variable-size string (utf-8) data
166 class ARROW_EXPORT StringArray : public BinaryArray {
167 public:
168 using TypeClass = StringType;
169
170 explicit StringArray(const std::shared_ptr<ArrayData>& data);
171
172 StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
173 const std::shared_ptr<Buffer>& data,
174 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
175 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
176
177 /// \brief Validate that this array contains only valid UTF8 entries
178 ///
179 /// This check is also implied by ValidateFull()
180 Status ValidateUTF8() const;
181 };
182
183 /// Concrete Array class for large variable-size binary data
184 class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
185 public:
186 explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
187
188 LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
189 const std::shared_ptr<Buffer>& data,
190 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
191 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
192
193 protected:
194 // For subclasses such as LargeStringArray
195 LargeBinaryArray() : BaseBinaryArray() {}
196 };
197
198 /// Concrete Array class for large variable-size string (utf-8) data
199 class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
200 public:
201 using TypeClass = LargeStringType;
202
203 explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
204
205 LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
206 const std::shared_ptr<Buffer>& data,
207 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
208 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
209
210 /// \brief Validate that this array contains only valid UTF8 entries
211 ///
212 /// This check is also implied by ValidateFull()
213 Status ValidateUTF8() const;
214 };
215
216 // ----------------------------------------------------------------------
217 // Fixed width binary
218
219 /// Concrete Array class for fixed-size binary data
220 class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
221 public:
222 using TypeClass = FixedSizeBinaryType;
223 using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
224
225 explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
226
227 FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
228 const std::shared_ptr<Buffer>& data,
229 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
230 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
231
232 const uint8_t* GetValue(int64_t i) const;
233 const uint8_t* Value(int64_t i) const { return GetValue(i); }
234
235 util::string_view GetView(int64_t i) const {
236 return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
237 }
238
239 std::string GetString(int64_t i) const { return std::string(GetView(i)); }
240
241 int32_t byte_width() const { return byte_width_; }
242
243 const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
244
245 IteratorType begin() const { return IteratorType(*this); }
246
247 IteratorType end() const { return IteratorType(*this, length()); }
248
249 protected:
250 void SetData(const std::shared_ptr<ArrayData>& data) {
251 this->PrimitiveArray::SetData(data);
252 byte_width_ =
253 internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
254 }
255
256 int32_t byte_width_;
257 };
258
259 /// @}
260
261 } // namespace arrow