]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <algorithm> | |
21 | #include <cstddef> | |
22 | #include <memory> | |
23 | #include <sstream> | |
24 | #include <string> | |
25 | #include <tuple> | |
26 | #include <type_traits> | |
27 | #include <utility> | |
28 | #include <vector> | |
29 | ||
30 | #include "arrow/array.h" | |
31 | #include "arrow/array/builder_base.h" | |
32 | #include "arrow/array/builder_binary.h" | |
33 | #include "arrow/array/builder_nested.h" | |
34 | #include "arrow/array/builder_primitive.h" | |
35 | #include "arrow/chunked_array.h" | |
36 | #include "arrow/compute/api.h" | |
37 | #include "arrow/status.h" | |
38 | #include "arrow/table.h" | |
39 | #include "arrow/type_fwd.h" | |
40 | #include "arrow/type_traits.h" | |
41 | #include "arrow/util/checked_cast.h" | |
42 | #include "arrow/util/macros.h" | |
43 | ||
44 | namespace arrow { | |
45 | ||
46 | class Schema; | |
47 | ||
48 | namespace stl { | |
49 | ||
50 | namespace internal { | |
51 | ||
52 | template <typename T, typename = void> | |
53 | struct is_optional_like : public std::false_type {}; | |
54 | ||
55 | template <typename T, typename = void> | |
56 | struct is_dereferencable : public std::false_type {}; | |
57 | ||
58 | template <typename T> | |
59 | struct is_dereferencable<T, arrow::internal::void_t<decltype(*std::declval<T>())>> | |
60 | : public std::true_type {}; | |
61 | ||
62 | template <typename T> | |
63 | struct is_optional_like< | |
64 | T, typename std::enable_if< | |
65 | std::is_constructible<bool, T>::value && is_dereferencable<T>::value && | |
66 | !std::is_array<typename std::remove_reference<T>::type>::value>::type> | |
67 | : public std::true_type {}; | |
68 | ||
/// Type of the Index-th element of a tuple-like type after std::decay has been
/// applied to it (references and cv-qualifiers are stripped).
template <std::size_t Index, typename TupleLike>
using BareTupleElement =
    typename std::decay<typename std::tuple_element<Index, TupleLike>::type>::type;
72 | ||
73 | } // namespace internal | |
74 | ||
75 | template <typename T, typename R = void> | |
76 | using enable_if_optional_like = | |
77 | typename std::enable_if<internal::is_optional_like<T>::value, R>::type; | |
78 | ||
79 | /// Traits meta class to map standard C/C++ types to equivalent Arrow types. | |
80 | template <typename T, typename Enable = void> | |
81 | struct ConversionTraits {}; | |
82 | ||
83 | /// Returns builder type for given standard C/C++ type. | |
84 | template <typename CType> | |
85 | using CBuilderType = | |
86 | typename TypeTraits<typename ConversionTraits<CType>::ArrowType>::BuilderType; | |
87 | ||
88 | /// Default implementation of AppendListValues. | |
89 | /// | |
90 | /// This function can be specialized by user to take advantage of appending | |
91 | /// contiguous ranges while appending. This default implementation will call | |
92 | /// ConversionTraits<ValueCType>::AppendRow() for each value in the range. | |
93 | template <typename ValueCType, typename Range> | |
94 | inline Status AppendListValues(CBuilderType<ValueCType>& value_builder, | |
95 | Range&& cell_range) { | |
96 | for (auto const& value : cell_range) { | |
97 | ARROW_RETURN_NOT_OK(ConversionTraits<ValueCType>::AppendRow(value_builder, value)); | |
98 | } | |
99 | return Status::OK(); | |
100 | } | |
101 | ||
102 | #define ARROW_STL_CONVERSION(CType_, ArrowType_) \ | |
103 | template <> \ | |
104 | struct ConversionTraits<CType_> : public CTypeTraits<CType_> { \ | |
105 | static Status AppendRow(typename TypeTraits<ArrowType_>::BuilderType& builder, \ | |
106 | CType_ cell) { \ | |
107 | return builder.Append(cell); \ | |
108 | } \ | |
109 | static CType_ GetEntry(const typename TypeTraits<ArrowType_>::ArrayType& array, \ | |
110 | size_t j) { \ | |
111 | return array.Value(j); \ | |
112 | } \ | |
113 | }; \ | |
114 | \ | |
115 | template <> \ | |
116 | inline Status AppendListValues<CType_, const std::vector<CType_>&>( \ | |
117 | typename TypeTraits<ArrowType_>::BuilderType & value_builder, \ | |
118 | const std::vector<CType_>& cell_range) { \ | |
119 | return value_builder.AppendValues(cell_range); \ | |
120 | } | |
121 | ||
122 | ARROW_STL_CONVERSION(bool, BooleanType) | |
123 | ARROW_STL_CONVERSION(int8_t, Int8Type) | |
124 | ARROW_STL_CONVERSION(int16_t, Int16Type) | |
125 | ARROW_STL_CONVERSION(int32_t, Int32Type) | |
126 | ARROW_STL_CONVERSION(int64_t, Int64Type) | |
127 | ARROW_STL_CONVERSION(uint8_t, UInt8Type) | |
128 | ARROW_STL_CONVERSION(uint16_t, UInt16Type) | |
129 | ARROW_STL_CONVERSION(uint32_t, UInt32Type) | |
130 | ARROW_STL_CONVERSION(uint64_t, UInt64Type) | |
131 | ARROW_STL_CONVERSION(float, FloatType) | |
132 | ARROW_STL_CONVERSION(double, DoubleType) | |
133 | ||
134 | template <> | |
135 | struct ConversionTraits<std::string> : public CTypeTraits<std::string> { | |
136 | static Status AppendRow(StringBuilder& builder, const std::string& cell) { | |
137 | return builder.Append(cell); | |
138 | } | |
139 | static std::string GetEntry(const StringArray& array, size_t j) { | |
140 | return array.GetString(j); | |
141 | } | |
142 | }; | |
143 | ||
144 | /// Append cell range elements as a single value to the list builder. | |
145 | /// | |
146 | /// Cell range will be added to child builder using AppendListValues<ValueCType>() | |
147 | /// if provided. AppendListValues<ValueCType>() has a default implementation, but | |
148 | /// it can be specialized by users. | |
149 | template <typename ValueCType, typename ListBuilderType, typename Range> | |
150 | Status AppendCellRange(ListBuilderType& builder, Range&& cell_range) { | |
151 | constexpr bool is_list_builder = std::is_same<ListBuilderType, ListBuilder>::value; | |
152 | constexpr bool is_large_list_builder = | |
153 | std::is_same<ListBuilderType, LargeListBuilder>::value; | |
154 | static_assert( | |
155 | is_list_builder || is_large_list_builder, | |
156 | "Builder type must be either ListBuilder or LargeListBuilder for appending " | |
157 | "multiple rows."); | |
158 | ||
159 | using ChildBuilderType = CBuilderType<ValueCType>; | |
160 | ARROW_RETURN_NOT_OK(builder.Append()); | |
161 | auto& value_builder = | |
162 | ::arrow::internal::checked_cast<ChildBuilderType&>(*builder.value_builder()); | |
163 | ||
164 | // XXX: Remove appended value before returning if status isn't OK? | |
165 | return AppendListValues<ValueCType>(value_builder, std::forward<Range>(cell_range)); | |
166 | } | |
167 | ||
168 | template <typename ValueCType> | |
169 | struct ConversionTraits<std::vector<ValueCType>> | |
170 | : public CTypeTraits<std::vector<ValueCType>> { | |
171 | static Status AppendRow(ListBuilder& builder, const std::vector<ValueCType>& cell) { | |
172 | return AppendCellRange<ValueCType>(builder, cell); | |
173 | } | |
174 | ||
175 | static std::vector<ValueCType> GetEntry(const ListArray& array, size_t j) { | |
176 | using ElementArrayType = | |
177 | typename TypeTraits<typename ConversionTraits<ValueCType>::ArrowType>::ArrayType; | |
178 | ||
179 | const ElementArrayType& value_array = | |
180 | ::arrow::internal::checked_cast<const ElementArrayType&>(*array.values()); | |
181 | ||
182 | std::vector<ValueCType> vec(array.value_length(j)); | |
183 | for (int64_t i = 0; i < array.value_length(j); i++) { | |
184 | vec[i] = | |
185 | ConversionTraits<ValueCType>::GetEntry(value_array, array.value_offset(j) + i); | |
186 | } | |
187 | return vec; | |
188 | } | |
189 | }; | |
190 | ||
191 | template <typename Optional> | |
192 | struct ConversionTraits<Optional, enable_if_optional_like<Optional>> | |
193 | : public CTypeTraits<typename std::decay<decltype(*std::declval<Optional>())>::type> { | |
194 | using OptionalInnerType = | |
195 | typename std::decay<decltype(*std::declval<Optional>())>::type; | |
196 | using typename CTypeTraits<OptionalInnerType>::ArrowType; | |
197 | using CTypeTraits<OptionalInnerType>::type_singleton; | |
198 | ||
199 | static Status AppendRow(typename TypeTraits<ArrowType>::BuilderType& builder, | |
200 | const Optional& cell) { | |
201 | if (cell) { | |
202 | return ConversionTraits<OptionalInnerType>::AppendRow(builder, *cell); | |
203 | } else { | |
204 | return builder.AppendNull(); | |
205 | } | |
206 | } | |
207 | }; | |
208 | ||
209 | /// Build an arrow::Schema based upon the types defined in a std::tuple-like structure. | |
210 | /// | |
211 | /// While the type information is available at compile-time, we still need to add the | |
212 | /// column names at runtime, thus these methods are not constexpr. | |
213 | template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value> | |
214 | struct SchemaFromTuple { | |
215 | using Element = internal::BareTupleElement<N - 1, Tuple>; | |
216 | ||
217 | // Implementations that take a vector-like object for the column names. | |
218 | ||
219 | /// Recursively build a vector of arrow::Field from the defined types. | |
220 | /// | |
221 | /// In most cases MakeSchema is the better entrypoint for the Schema creation. | |
222 | static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion( | |
223 | const std::vector<std::string>& names) { | |
224 | std::vector<std::shared_ptr<Field>> ret = | |
225 | SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursion(names); | |
226 | auto type = ConversionTraits<Element>::type_singleton(); | |
227 | ret.push_back(field(names[N - 1], type, internal::is_optional_like<Element>::value)); | |
228 | return ret; | |
229 | } | |
230 | ||
231 | /// Build a Schema from the types of the tuple-like structure passed in as template | |
232 | /// parameter assign the column names at runtime. | |
233 | /// | |
234 | /// An example usage of this API can look like the following: | |
235 | /// | |
236 | /// \code{.cpp} | |
237 | /// using TupleType = std::tuple<int, std::vector<std::string>>; | |
238 | /// std::shared_ptr<Schema> schema = | |
239 | /// SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"}); | |
240 | /// \endcode | |
241 | static std::shared_ptr<Schema> MakeSchema(const std::vector<std::string>& names) { | |
242 | return std::make_shared<Schema>(MakeSchemaRecursion(names)); | |
243 | } | |
244 | ||
245 | // Implementations that take a tuple-like object for the column names. | |
246 | ||
247 | /// Recursively build a vector of arrow::Field from the defined types. | |
248 | /// | |
249 | /// In most cases MakeSchema is the better entrypoint for the Schema creation. | |
250 | template <typename NamesTuple> | |
251 | static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT( | |
252 | const NamesTuple& names) { | |
253 | using std::get; | |
254 | ||
255 | std::vector<std::shared_ptr<Field>> ret = | |
256 | SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursionT(names); | |
257 | std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton(); | |
258 | ret.push_back( | |
259 | field(get<N - 1>(names), type, internal::is_optional_like<Element>::value)); | |
260 | return ret; | |
261 | } | |
262 | ||
263 | /// Build a Schema from the types of the tuple-like structure passed in as template | |
264 | /// parameter assign the column names at runtime. | |
265 | /// | |
266 | /// An example usage of this API can look like the following: | |
267 | /// | |
268 | /// \code{.cpp} | |
269 | /// using TupleType = std::tuple<int, std::vector<std::string>>; | |
270 | /// std::shared_ptr<Schema> schema = | |
271 | /// SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"}); | |
272 | /// \endcode | |
273 | template <typename NamesTuple> | |
274 | static std::shared_ptr<Schema> MakeSchema(const NamesTuple& names) { | |
275 | return std::make_shared<Schema>(MakeSchemaRecursionT<NamesTuple>(names)); | |
276 | } | |
277 | }; | |
278 | ||
279 | template <typename Tuple> | |
280 | struct SchemaFromTuple<Tuple, 0> { | |
281 | static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion( | |
282 | const std::vector<std::string>& names) { | |
283 | std::vector<std::shared_ptr<Field>> ret; | |
284 | ret.reserve(names.size()); | |
285 | return ret; | |
286 | } | |
287 | ||
288 | template <typename NamesTuple> | |
289 | static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT( | |
290 | const NamesTuple& names) { | |
291 | std::vector<std::shared_ptr<Field>> ret; | |
292 | ret.reserve(std::tuple_size<NamesTuple>::value); | |
293 | return ret; | |
294 | } | |
295 | }; | |
296 | ||
297 | namespace internal { | |
298 | ||
299 | template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value> | |
300 | struct CreateBuildersRecursive { | |
301 | static Status Make(MemoryPool* pool, | |
302 | std::vector<std::unique_ptr<ArrayBuilder>>* builders) { | |
303 | using Element = BareTupleElement<N - 1, Tuple>; | |
304 | std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton(); | |
305 | ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builders->at(N - 1))); | |
306 | ||
307 | return CreateBuildersRecursive<Tuple, N - 1>::Make(pool, builders); | |
308 | } | |
309 | }; | |
310 | ||
311 | template <typename Tuple> | |
312 | struct CreateBuildersRecursive<Tuple, 0> { | |
313 | static Status Make(MemoryPool*, std::vector<std::unique_ptr<ArrayBuilder>>*) { | |
314 | return Status::OK(); | |
315 | } | |
316 | }; | |
317 | ||
318 | template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value> | |
319 | struct RowIterator { | |
320 | static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders, | |
321 | const Tuple& row) { | |
322 | using std::get; | |
323 | using Element = BareTupleElement<N - 1, Tuple>; | |
324 | using BuilderType = | |
325 | typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::BuilderType; | |
326 | ||
327 | BuilderType& builder = | |
328 | ::arrow::internal::checked_cast<BuilderType&>(*builders[N - 1]); | |
329 | ARROW_RETURN_NOT_OK(ConversionTraits<Element>::AppendRow(builder, get<N - 1>(row))); | |
330 | ||
331 | return RowIterator<Tuple, N - 1>::Append(builders, row); | |
332 | } | |
333 | }; | |
334 | ||
335 | template <typename Tuple> | |
336 | struct RowIterator<Tuple, 0> { | |
337 | static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders, | |
338 | const Tuple& row) { | |
339 | return Status::OK(); | |
340 | } | |
341 | }; | |
342 | ||
343 | template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value> | |
344 | struct EnsureColumnTypes { | |
345 | static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner, | |
346 | const compute::CastOptions& cast_options, compute::ExecContext* ctx, | |
347 | std::reference_wrapper<const ::arrow::Table>* result) { | |
348 | using Element = BareTupleElement<N - 1, Tuple>; | |
349 | std::shared_ptr<DataType> expected_type = ConversionTraits<Element>::type_singleton(); | |
350 | ||
351 | if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) { | |
352 | ARROW_ASSIGN_OR_RAISE( | |
353 | Datum casted, | |
354 | compute::Cast(table.column(N - 1), expected_type, cast_options, ctx)); | |
355 | auto new_field = table.schema()->field(N - 1)->WithType(expected_type); | |
356 | ARROW_ASSIGN_OR_RAISE(*table_owner, | |
357 | table.SetColumn(N - 1, new_field, casted.chunked_array())); | |
358 | *result = **table_owner; | |
359 | } | |
360 | ||
361 | return EnsureColumnTypes<Tuple, N - 1>::Cast(result->get(), table_owner, cast_options, | |
362 | ctx, result); | |
363 | } | |
364 | }; | |
365 | ||
366 | template <typename Tuple> | |
367 | struct EnsureColumnTypes<Tuple, 0> { | |
368 | static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner, | |
369 | const compute::CastOptions& cast_options, compute::ExecContext* ctx, | |
370 | std::reference_wrapper<const ::arrow::Table>* result) { | |
371 | return Status::OK(); | |
372 | } | |
373 | }; | |
374 | ||
375 | template <typename Range, typename Tuple, std::size_t N = std::tuple_size<Tuple>::value> | |
376 | struct TupleSetter { | |
377 | static void Fill(const Table& table, Range* rows) { | |
378 | using std::get; | |
379 | using Element = typename std::tuple_element<N - 1, Tuple>::type; | |
380 | using ArrayType = | |
381 | typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::ArrayType; | |
382 | ||
383 | auto iter = rows->begin(); | |
384 | const ChunkedArray& chunked_array = *table.column(N - 1); | |
385 | for (int i = 0; i < chunked_array.num_chunks(); i++) { | |
386 | const ArrayType& array = | |
387 | ::arrow::internal::checked_cast<const ArrayType&>(*chunked_array.chunk(i)); | |
388 | for (int64_t j = 0; j < array.length(); j++) { | |
389 | get<N - 1>(*iter++) = ConversionTraits<Element>::GetEntry(array, j); | |
390 | } | |
391 | } | |
392 | ||
393 | return TupleSetter<Range, Tuple, N - 1>::Fill(table, rows); | |
394 | } | |
395 | }; | |
396 | ||
397 | template <typename Range, typename Tuple> | |
398 | struct TupleSetter<Range, Tuple, 0> { | |
399 | static void Fill(const Table& table, Range* rows) {} | |
400 | }; | |
401 | ||
402 | } // namespace internal | |
403 | ||
404 | template <typename Range> | |
405 | Status TableFromTupleRange(MemoryPool* pool, Range&& rows, | |
406 | const std::vector<std::string>& names, | |
407 | std::shared_ptr<Table>* table) { | |
408 | using row_type = typename std::iterator_traits<decltype(std::begin(rows))>::value_type; | |
409 | constexpr std::size_t n_columns = std::tuple_size<row_type>::value; | |
410 | ||
411 | std::shared_ptr<Schema> schema = SchemaFromTuple<row_type>::MakeSchema(names); | |
412 | ||
413 | std::vector<std::unique_ptr<ArrayBuilder>> builders(n_columns); | |
414 | ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive<row_type>::Make(pool, &builders)); | |
415 | ||
416 | for (auto const& row : rows) { | |
417 | ARROW_RETURN_NOT_OK(internal::RowIterator<row_type>::Append(builders, row)); | |
418 | } | |
419 | ||
420 | std::vector<std::shared_ptr<Array>> arrays; | |
421 | for (auto const& builder : builders) { | |
422 | std::shared_ptr<Array> array; | |
423 | ARROW_RETURN_NOT_OK(builder->Finish(&array)); | |
424 | arrays.emplace_back(array); | |
425 | } | |
426 | ||
427 | *table = Table::Make(std::move(schema), std::move(arrays)); | |
428 | ||
429 | return Status::OK(); | |
430 | } | |
431 | ||
432 | template <typename Range> | |
433 | Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_options, | |
434 | compute::ExecContext* ctx, Range* rows) { | |
435 | using row_type = typename std::decay<decltype(*std::begin(*rows))>::type; | |
436 | constexpr std::size_t n_columns = std::tuple_size<row_type>::value; | |
437 | ||
438 | if (table.schema()->num_fields() != n_columns) { | |
439 | std::stringstream ss; | |
440 | ss << "Number of columns in the table does not match the width of the target: "; | |
441 | ss << table.schema()->num_fields() << " != " << n_columns; | |
442 | return Status::Invalid(ss.str()); | |
443 | } | |
444 | ||
445 | // TODO: Use std::size with C++17 | |
446 | if (rows->size() != static_cast<size_t>(table.num_rows())) { | |
447 | std::stringstream ss; | |
448 | ss << "Number of rows in the table does not match the size of the target: "; | |
449 | ss << table.num_rows() << " != " << rows->size(); | |
450 | return Status::Invalid(ss.str()); | |
451 | } | |
452 | ||
453 | // Check that all columns have the correct type, otherwise cast them. | |
454 | std::shared_ptr<Table> table_owner; | |
455 | std::reference_wrapper<const ::arrow::Table> current_table(table); | |
456 | ||
457 | ARROW_RETURN_NOT_OK(internal::EnsureColumnTypes<row_type>::Cast( | |
458 | table, &table_owner, cast_options, ctx, ¤t_table)); | |
459 | ||
460 | internal::TupleSetter<Range, row_type>::Fill(current_table.get(), rows); | |
461 | ||
462 | return Status::OK(); | |
463 | } | |
464 | ||
465 | } // namespace stl | |
466 | } // namespace arrow |