1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
18 #include "arrow/python/python_to_arrow.h"
19 #include "arrow/python/numpy_interop.h"
30 #include "arrow/array.h"
31 #include "arrow/array/builder_binary.h"
32 #include "arrow/array/builder_decimal.h"
33 #include "arrow/array/builder_dict.h"
34 #include "arrow/array/builder_nested.h"
35 #include "arrow/array/builder_primitive.h"
36 #include "arrow/array/builder_time.h"
37 #include "arrow/chunked_array.h"
38 #include "arrow/status.h"
39 #include "arrow/type.h"
40 #include "arrow/type_traits.h"
41 #include "arrow/util/checked_cast.h"
42 #include "arrow/util/converter.h"
43 #include "arrow/util/decimal.h"
44 #include "arrow/util/int_util_internal.h"
45 #include "arrow/util/logging.h"
47 #include "arrow/python/datetime.h"
48 #include "arrow/python/decimal.h"
49 #include "arrow/python/helpers.h"
50 #include "arrow/python/inference.h"
51 #include "arrow/python/iterators.h"
52 #include "arrow/python/numpy_convert.h"
53 #include "arrow/python/type_traits.h"
54 #include "arrow/visitor_inline.h"
58 using internal::checked_cast
;
59 using internal::checked_pointer_cast
;
61 using internal::Converter
;
62 using internal::DictionaryConverter
;
63 using internal::ListConverter
;
64 using internal::PrimitiveConverter
;
65 using internal::StructConverter
;
67 using internal::MakeChunker
;
68 using internal::MakeConverter
;
// The logical sub-fields of a month/day/nanosecond interval value, used to
// select which Python attributes are probed when populating each component.
enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds };

// Primary template; specialized per field below to supply the C storage type
// and the attribute/multiplier table.
template <MonthDayNanoField field>
struct MonthDayNanoTraits;
// One (attribute-name, unit-multiplier) entry of a conversion table.  Tables
// are terminated by a {nullptr, 0} sentinel.  The visible fragment was missing
// the `name` member, which the rest of this file dereferences (`attr->name`).
struct MonthDayNanoAttrData {
  // Python attribute name to probe on the input object (nullptr terminates).
  const char* name;
  // Multiplier converting the previously accumulated value into this unit.
  const int64_t multiplier;
};
84 struct MonthDayNanoTraits
<MonthDayNanoField::kMonths
> {
85 using c_type
= int32_t;
86 static const MonthDayNanoAttrData attrs
[];
89 const MonthDayNanoAttrData MonthDayNanoTraits
<MonthDayNanoField::kMonths
>::attrs
[] = {
90 {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}};
93 struct MonthDayNanoTraits
<MonthDayNanoField::kWeeksAndDays
> {
94 using c_type
= int32_t;
95 static const MonthDayNanoAttrData attrs
[];
98 const MonthDayNanoAttrData MonthDayNanoTraits
<MonthDayNanoField::kWeeksAndDays
>::attrs
[] =
99 {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}};
102 struct MonthDayNanoTraits
<MonthDayNanoField::kDaysOnly
> {
103 using c_type
= int32_t;
104 static const MonthDayNanoAttrData attrs
[];
107 const MonthDayNanoAttrData MonthDayNanoTraits
<MonthDayNanoField::kDaysOnly
>::attrs
[] = {
108 {"days", 1}, {nullptr, 0}};
111 struct MonthDayNanoTraits
<MonthDayNanoField::kNanoseconds
> {
112 using c_type
= int64_t;
113 static const MonthDayNanoAttrData attrs
[];
116 const MonthDayNanoAttrData MonthDayNanoTraits
<MonthDayNanoField::kNanoseconds
>::attrs
[] =
118 {"minutes", /*minutes_in_hours=*/60},
119 {"seconds", /*seconds_in_minute=*/60},
120 {"milliseconds", /*milliseconds_in_seconds*/ 1000},
121 {"microseconds", /*microseconds_in_millseconds=*/1000},
122 {"nanoseconds", /*nanoseconds_in_microseconds=*/1000},
125 template <MonthDayNanoField field
>
126 struct PopulateMonthDayNano
{
127 using Traits
= MonthDayNanoTraits
<field
>;
128 using field_c_type
= typename
Traits::c_type
;
130 static Status
Field(PyObject
* obj
, field_c_type
* out
, bool* found_attrs
) {
132 for (const MonthDayNanoAttrData
* attr
= &Traits::attrs
[0]; attr
->multiplier
!= 0;
134 if (attr
->multiplier
!= 1 &&
135 ::arrow::internal::MultiplyWithOverflow(
136 static_cast<field_c_type
>(attr
->multiplier
), *out
, out
)) {
137 return Status::Invalid("Overflow on: ", (attr
- 1)->name
,
138 " for: ", internal::PyObject_StdStringRepr(obj
));
141 OwnedRef
field_value(PyObject_GetAttrString(obj
, attr
->name
));
142 if (field_value
.obj() == nullptr) {
143 // No attribute present, skip to the next one.
150 RETURN_NOT_OK(internal::CIntFromPython(field_value
.obj(), &value
, attr
->name
));
151 if (::arrow::internal::AddWithOverflow(*out
, value
, out
)) {
152 return Status::Invalid("Overflow on: ", attr
->name
,
153 " for: ", internal::PyObject_StdStringRepr(obj
));
161 // Utility for converting single python objects to their intermediate C representations
162 // which can be fed to the typed builders
165 // Type aliases for shorter signature definitions
167 using O
= PyConversionOptions
;
169 // Used for null checking before actually converting the values
170 static bool IsNull(const O
& options
, I obj
) {
171 if (options
.from_pandas
) {
172 return internal::PandasObjectIsNull(obj
);
174 return obj
== Py_None
;
178 // Used for post-conversion numpy NaT sentinel checking
179 static bool IsNaT(const TimestampType
*, int64_t value
) {
180 return internal::npy_traits
<NPY_DATETIME
>::isnull(value
);
183 // Used for post-conversion numpy NaT sentinel checking
184 static bool IsNaT(const DurationType
*, int64_t value
) {
185 return internal::npy_traits
<NPY_TIMEDELTA
>::isnull(value
);
188 static Result
<std::nullptr_t
> Convert(const NullType
*, const O
&, I obj
) {
189 if (obj
== Py_None
) {
192 return Status::Invalid("Invalid null value");
196 static Result
<bool> Convert(const BooleanType
*, const O
&, I obj
) {
197 if (obj
== Py_True
) {
199 } else if (obj
== Py_False
) {
201 } else if (PyArray_IsScalar(obj
, Bool
)) {
202 return reinterpret_cast<PyBoolScalarObject
*>(obj
)->obval
== NPY_TRUE
;
204 return internal::InvalidValue(obj
, "tried to convert to boolean");
208 template <typename T
>
209 static enable_if_integer
<T
, Result
<typename
T::c_type
>> Convert(const T
* type
, const O
&,
211 typename
T::c_type value
;
212 auto status
= internal::CIntFromPython(obj
, &value
);
213 if (ARROW_PREDICT_TRUE(status
.ok())) {
215 } else if (!internal::PyIntScalar_Check(obj
)) {
216 std::stringstream ss
;
217 ss
<< "tried to convert to " << type
->ToString();
218 return internal::InvalidValue(obj
, ss
.str());
224 static Result
<uint16_t> Convert(const HalfFloatType
*, const O
&, I obj
) {
226 RETURN_NOT_OK(PyFloat_AsHalf(obj
, &value
));
230 static Result
<float> Convert(const FloatType
*, const O
&, I obj
) {
232 if (internal::PyFloatScalar_Check(obj
)) {
233 value
= static_cast<float>(PyFloat_AsDouble(obj
));
235 } else if (internal::PyIntScalar_Check(obj
)) {
236 RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj
, &value
));
238 return internal::InvalidValue(obj
, "tried to convert to float32");
243 static Result
<double> Convert(const DoubleType
*, const O
&, I obj
) {
245 if (PyFloat_Check(obj
)) {
246 value
= PyFloat_AS_DOUBLE(obj
);
247 } else if (internal::PyFloatScalar_Check(obj
)) {
248 // Other kinds of float-y things
249 value
= PyFloat_AsDouble(obj
);
251 } else if (internal::PyIntScalar_Check(obj
)) {
252 RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj
, &value
));
254 return internal::InvalidValue(obj
, "tried to convert to double");
259 static Result
<Decimal128
> Convert(const Decimal128Type
* type
, const O
&, I obj
) {
261 RETURN_NOT_OK(internal::DecimalFromPyObject(obj
, *type
, &value
));
265 static Result
<Decimal256
> Convert(const Decimal256Type
* type
, const O
&, I obj
) {
267 RETURN_NOT_OK(internal::DecimalFromPyObject(obj
, *type
, &value
));
271 static Result
<int32_t> Convert(const Date32Type
*, const O
&, I obj
) {
273 if (PyDate_Check(obj
)) {
274 auto pydate
= reinterpret_cast<PyDateTime_Date
*>(obj
);
275 value
= static_cast<int32_t>(internal::PyDate_to_days(pydate
));
278 internal::CIntFromPython(obj
, &value
, "Integer too large for date32"));
283 static Result
<int64_t> Convert(const Date64Type
*, const O
&, I obj
) {
285 if (PyDateTime_Check(obj
)) {
286 auto pydate
= reinterpret_cast<PyDateTime_DateTime
*>(obj
);
287 value
= internal::PyDateTime_to_ms(pydate
);
288 // Truncate any intraday milliseconds
289 // TODO: introduce an option for this
290 value
-= value
% 86400000LL;
291 } else if (PyDate_Check(obj
)) {
292 auto pydate
= reinterpret_cast<PyDateTime_Date
*>(obj
);
293 value
= internal::PyDate_to_ms(pydate
);
296 internal::CIntFromPython(obj
, &value
, "Integer too large for date64"));
301 static Result
<int32_t> Convert(const Time32Type
* type
, const O
&, I obj
) {
303 if (PyTime_Check(obj
)) {
304 switch (type
->unit()) {
305 case TimeUnit::SECOND
:
306 value
= static_cast<int32_t>(internal::PyTime_to_s(obj
));
308 case TimeUnit::MILLI
:
309 value
= static_cast<int32_t>(internal::PyTime_to_ms(obj
));
312 return Status::UnknownError("Invalid time unit");
315 RETURN_NOT_OK(internal::CIntFromPython(obj
, &value
, "Integer too large for int32"));
320 static Result
<int64_t> Convert(const Time64Type
* type
, const O
&, I obj
) {
322 if (PyTime_Check(obj
)) {
323 switch (type
->unit()) {
324 case TimeUnit::MICRO
:
325 value
= internal::PyTime_to_us(obj
);
328 value
= internal::PyTime_to_ns(obj
);
331 return Status::UnknownError("Invalid time unit");
334 RETURN_NOT_OK(internal::CIntFromPython(obj
, &value
, "Integer too large for int64"));
339 static Result
<int64_t> Convert(const TimestampType
* type
, const O
& options
, I obj
) {
340 int64_t value
, offset
;
341 if (PyDateTime_Check(obj
)) {
342 if (ARROW_PREDICT_FALSE(options
.ignore_timezone
)) {
345 ARROW_ASSIGN_OR_RAISE(offset
, internal::PyDateTime_utcoffset_s(obj
));
347 auto dt
= reinterpret_cast<PyDateTime_DateTime
*>(obj
);
348 switch (type
->unit()) {
349 case TimeUnit::SECOND
:
350 value
= internal::PyDateTime_to_s(dt
) - offset
;
352 case TimeUnit::MILLI
:
353 value
= internal::PyDateTime_to_ms(dt
) - offset
* 1000LL;
355 case TimeUnit::MICRO
:
356 value
= internal::PyDateTime_to_us(dt
) - offset
* 1000000LL;
359 if (internal::IsPandasTimestamp(obj
)) {
360 // pd.Timestamp value attribute contains the offset from unix epoch
361 // so no adjustment for timezone is need.
362 OwnedRef
nanos(PyObject_GetAttrString(obj
, "value"));
364 RETURN_NOT_OK(internal::CIntFromPython(nanos
.obj(), &value
));
366 // Conversion to nanoseconds can overflow -> check multiply of microseconds
367 value
= internal::PyDateTime_to_us(dt
);
368 if (arrow::internal::MultiplyWithOverflow(value
, 1000LL, &value
)) {
369 return internal::InvalidValue(obj
,
370 "out of bounds for nanosecond resolution");
373 // Adjust with offset and check for overflow
374 if (arrow::internal::SubtractWithOverflow(value
, offset
* 1000000000LL,
376 return internal::InvalidValue(obj
,
377 "out of bounds for nanosecond resolution");
382 return Status::UnknownError("Invalid time unit");
384 } else if (PyArray_CheckAnyScalarExact(obj
)) {
385 // validate that the numpy scalar has np.datetime64 dtype
386 std::shared_ptr
<DataType
> numpy_type
;
387 RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj
), &numpy_type
));
388 if (!numpy_type
->Equals(*type
)) {
389 return Status::NotImplemented("Expected np.datetime64 but got: ",
390 numpy_type
->ToString());
392 return reinterpret_cast<PyDatetimeScalarObject
*>(obj
)->obval
;
394 RETURN_NOT_OK(internal::CIntFromPython(obj
, &value
));
399 static Result
<MonthDayNanoIntervalType::MonthDayNanos
> Convert(
400 const MonthDayNanoIntervalType
* /*type*/, const O
& /*options*/, I obj
) {
401 MonthDayNanoIntervalType::MonthDayNanos output
;
402 bool found_attrs
= false;
403 RETURN_NOT_OK(PopulateMonthDayNano
<MonthDayNanoField::kMonths
>::Field(
404 obj
, &output
.months
, &found_attrs
));
405 // on relativeoffset weeks is a property calculated from days. On
406 // DateOffset is is a field on its own. timedelta doesn't have a weeks
408 PyObject
* pandas_date_offset_type
= internal::BorrowPandasDataOffsetType();
409 bool is_date_offset
= pandas_date_offset_type
== (PyObject
*)Py_TYPE(obj
);
410 if (!is_date_offset
) {
411 RETURN_NOT_OK(PopulateMonthDayNano
<MonthDayNanoField::kDaysOnly
>::Field(
412 obj
, &output
.days
, &found_attrs
));
414 RETURN_NOT_OK(PopulateMonthDayNano
<MonthDayNanoField::kWeeksAndDays
>::Field(
415 obj
, &output
.days
, &found_attrs
));
417 RETURN_NOT_OK(PopulateMonthDayNano
<MonthDayNanoField::kNanoseconds
>::Field(
418 obj
, &output
.nanoseconds
, &found_attrs
));
420 if (ARROW_PREDICT_FALSE(!found_attrs
) && !is_date_offset
) {
421 // date_offset can have zero fields.
422 return Status::TypeError("No temporal attributes found on object.");
427 static Result
<int64_t> Convert(const DurationType
* type
, const O
&, I obj
) {
429 if (PyDelta_Check(obj
)) {
430 auto dt
= reinterpret_cast<PyDateTime_Delta
*>(obj
);
431 switch (type
->unit()) {
432 case TimeUnit::SECOND
:
433 value
= internal::PyDelta_to_s(dt
);
435 case TimeUnit::MILLI
:
436 value
= internal::PyDelta_to_ms(dt
);
438 case TimeUnit::MICRO
:
439 value
= internal::PyDelta_to_us(dt
);
442 if (internal::IsPandasTimedelta(obj
)) {
443 OwnedRef
nanos(PyObject_GetAttrString(obj
, "value"));
445 RETURN_NOT_OK(internal::CIntFromPython(nanos
.obj(), &value
));
447 value
= internal::PyDelta_to_ns(dt
);
451 return Status::UnknownError("Invalid time unit");
453 } else if (PyArray_CheckAnyScalarExact(obj
)) {
454 // validate that the numpy scalar has np.datetime64 dtype
455 std::shared_ptr
<DataType
> numpy_type
;
456 RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj
), &numpy_type
));
457 if (!numpy_type
->Equals(*type
)) {
458 return Status::NotImplemented("Expected np.timedelta64 but got: ",
459 numpy_type
->ToString());
461 return reinterpret_cast<PyTimedeltaScalarObject
*>(obj
)->obval
;
463 RETURN_NOT_OK(internal::CIntFromPython(obj
, &value
));
468 // The binary-like intermediate representation is PyBytesView because it keeps temporary
469 // python objects alive (non-contiguous memoryview) and stores whether the original
470 // object was unicode encoded or not, which is used for unicode -> bytes coersion if
471 // there is a non-unicode object observed.
473 static Status
Convert(const BaseBinaryType
*, const O
&, I obj
, PyBytesView
& view
) {
474 return view
.ParseString(obj
);
477 static Status
Convert(const FixedSizeBinaryType
* type
, const O
&, I obj
,
479 ARROW_RETURN_NOT_OK(view
.ParseString(obj
));
480 if (view
.size
!= type
->byte_width()) {
481 std::stringstream ss
;
482 ss
<< "expected to be length " << type
->byte_width() << " was " << view
.size
;
483 return internal::InvalidValue(obj
, ss
.str());
489 template <typename T
>
490 static enable_if_string
<T
, Status
> Convert(const T
*, const O
& options
, I obj
,
492 if (options
.strict
) {
493 // Strict conversion, force output to be unicode / utf8 and validate that
494 // any binary values are utf8
495 ARROW_RETURN_NOT_OK(view
.ParseString(obj
, true));
497 return internal::InvalidValue(obj
, "was not a utf8 string");
501 // Non-strict conversion; keep track of whether values are unicode or bytes
502 return view
.ParseString(obj
);
506 static Result
<bool> Convert(const DataType
* type
, const O
&, I obj
) {
507 return Status::NotImplemented("PyValue::Convert is not implemented for type ", type
);
511 // The base Converter class is a mixin with predefined behavior and constructors.
512 class PyConverter
: public Converter
<PyObject
*, PyConversionOptions
> {
514 // Iterate over the input values and defer the conversion to the Append method
515 Status
Extend(PyObject
* values
, int64_t size
, int64_t offset
= 0) override
{
516 DCHECK_GE(size
, offset
);
517 /// Ensure we've allocated enough space
518 RETURN_NOT_OK(this->Reserve(size
- offset
));
519 // Iterate over the items adding each one
520 return internal::VisitSequence(
522 [this](PyObject
* item
, bool* /* unused */) { return this->Append(item
); });
525 // Convert and append a sequence of values masked with a numpy array
526 Status
ExtendMasked(PyObject
* values
, PyObject
* mask
, int64_t size
,
527 int64_t offset
= 0) override
{
528 DCHECK_GE(size
, offset
);
529 /// Ensure we've allocated enough space
530 RETURN_NOT_OK(this->Reserve(size
- offset
));
531 // Iterate over the items adding each one
532 return internal::VisitSequenceMasked(
533 values
, mask
, offset
, [this](PyObject
* item
, bool is_masked
, bool* /* unused */) {
535 return this->AppendNull();
537 // This will also apply the null-checking convention in the event
538 // that the value is not masked
539 return this->Append(item
); // perhaps use AppendValue instead?
// Forward declarations for the concrete converter kinds; the PyConverterTrait
// specializations below select among them per Arrow type.
template <typename T, typename Enable = void>
class PyPrimitiveConverter;

template <typename T>
class PyListConverter;

template <typename U, typename Enable = void>
class PyDictionaryConverter;

class PyStructConverter;

template <typename T, typename Enable = void>
struct PyConverterTrait;
559 template <typename T
>
560 struct PyConverterTrait
<
561 T
, enable_if_t
<(!is_nested_type
<T
>::value
&& !is_interval_type
<T
>::value
&&
562 !is_extension_type
<T
>::value
) ||
563 std::is_same
<T
, MonthDayNanoIntervalType
>::value
>> {
564 using type
= PyPrimitiveConverter
<T
>;
567 template <typename T
>
568 struct PyConverterTrait
<T
, enable_if_list_like
<T
>> {
569 using type
= PyListConverter
<T
>;
573 struct PyConverterTrait
<StructType
> {
574 using type
= PyStructConverter
;
578 struct PyConverterTrait
<DictionaryType
> {
579 template <typename T
>
580 using dictionary_type
= PyDictionaryConverter
<T
>;
583 template <typename T
>
584 class PyPrimitiveConverter
<T
, enable_if_null
<T
>>
585 : public PrimitiveConverter
<T
, PyConverter
> {
587 Status
Append(PyObject
* value
) override
{
588 if (PyValue::IsNull(this->options_
, value
)) {
589 return this->primitive_builder_
->AppendNull();
591 ARROW_ASSIGN_OR_RAISE(
592 auto converted
, PyValue::Convert(this->primitive_type_
, this->options_
, value
));
593 return this->primitive_builder_
->Append(converted
);
598 template <typename T
>
599 class PyPrimitiveConverter
<
600 T
, enable_if_t
<is_boolean_type
<T
>::value
|| is_number_type
<T
>::value
||
601 is_decimal_type
<T
>::value
|| is_date_type
<T
>::value
||
602 is_time_type
<T
>::value
||
603 std::is_same
<MonthDayNanoIntervalType
, T
>::value
>>
604 : public PrimitiveConverter
<T
, PyConverter
> {
606 Status
Append(PyObject
* value
) override
{
607 // Since the required space has been already allocated in the Extend functions we can
608 // rely on the Unsafe builder API which improves the performance.
609 if (PyValue::IsNull(this->options_
, value
)) {
610 this->primitive_builder_
->UnsafeAppendNull();
612 ARROW_ASSIGN_OR_RAISE(
613 auto converted
, PyValue::Convert(this->primitive_type_
, this->options_
, value
));
614 this->primitive_builder_
->UnsafeAppend(converted
);
620 template <typename T
>
621 class PyPrimitiveConverter
<
622 T
, enable_if_t
<is_timestamp_type
<T
>::value
|| is_duration_type
<T
>::value
>>
623 : public PrimitiveConverter
<T
, PyConverter
> {
625 Status
Append(PyObject
* value
) override
{
626 if (PyValue::IsNull(this->options_
, value
)) {
627 this->primitive_builder_
->UnsafeAppendNull();
629 ARROW_ASSIGN_OR_RAISE(
630 auto converted
, PyValue::Convert(this->primitive_type_
, this->options_
, value
));
631 // Numpy NaT sentinels can be checked after the conversion
632 if (PyArray_CheckAnyScalarExact(value
) &&
633 PyValue::IsNaT(this->primitive_type_
, converted
)) {
634 this->primitive_builder_
->UnsafeAppendNull();
636 this->primitive_builder_
->UnsafeAppend(converted
);
643 template <typename T
>
644 class PyPrimitiveConverter
<T
, enable_if_t
<std::is_same
<T
, FixedSizeBinaryType
>::value
>>
645 : public PrimitiveConverter
<T
, PyConverter
> {
647 Status
Append(PyObject
* value
) override
{
648 if (PyValue::IsNull(this->options_
, value
)) {
649 this->primitive_builder_
->UnsafeAppendNull();
652 PyValue::Convert(this->primitive_type_
, this->options_
, value
, view_
));
653 ARROW_RETURN_NOT_OK(this->primitive_builder_
->ReserveData(view_
.size
));
654 this->primitive_builder_
->UnsafeAppend(view_
.bytes
);
663 template <typename T
>
664 class PyPrimitiveConverter
<T
, enable_if_base_binary
<T
>>
665 : public PrimitiveConverter
<T
, PyConverter
> {
667 using OffsetType
= typename
T::offset_type
;
669 Status
Append(PyObject
* value
) override
{
670 if (PyValue::IsNull(this->options_
, value
)) {
671 this->primitive_builder_
->UnsafeAppendNull();
674 PyValue::Convert(this->primitive_type_
, this->options_
, value
, view_
));
675 if (!view_
.is_utf8
) {
676 // observed binary value
677 observed_binary_
= true;
679 // Since we don't know the varying length input size in advance, we need to
680 // reserve space in the value builder one by one. ReserveData raises CapacityError
681 // if the value would not fit into the array.
682 ARROW_RETURN_NOT_OK(this->primitive_builder_
->ReserveData(view_
.size
));
683 this->primitive_builder_
->UnsafeAppend(view_
.bytes
,
684 static_cast<OffsetType
>(view_
.size
));
689 Result
<std::shared_ptr
<Array
>> ToArray() override
{
690 ARROW_ASSIGN_OR_RAISE(auto array
, (PrimitiveConverter
<T
, PyConverter
>::ToArray()));
691 if (observed_binary_
) {
692 // if we saw any non-unicode, cast results to BinaryArray
693 auto binary_type
= TypeTraits
<typename
T::PhysicalType
>::type_singleton();
694 return array
->View(binary_type
);
702 bool observed_binary_
= false;
705 template <typename U
>
706 class PyDictionaryConverter
<U
, enable_if_has_c_type
<U
>>
707 : public DictionaryConverter
<U
, PyConverter
> {
709 Status
Append(PyObject
* value
) override
{
710 if (PyValue::IsNull(this->options_
, value
)) {
711 return this->value_builder_
->AppendNull();
713 ARROW_ASSIGN_OR_RAISE(auto converted
,
714 PyValue::Convert(this->value_type_
, this->options_
, value
));
715 return this->value_builder_
->Append(converted
);
720 template <typename U
>
721 class PyDictionaryConverter
<U
, enable_if_has_string_view
<U
>>
722 : public DictionaryConverter
<U
, PyConverter
> {
724 Status
Append(PyObject
* value
) override
{
725 if (PyValue::IsNull(this->options_
, value
)) {
726 return this->value_builder_
->AppendNull();
729 PyValue::Convert(this->value_type_
, this->options_
, value
, view_
));
730 return this->value_builder_
->Append(view_
.bytes
, static_cast<int32_t>(view_
.size
));
738 template <typename T
>
739 class PyListConverter
: public ListConverter
<T
, PyConverter
, PyConverterTrait
> {
741 Status
Append(PyObject
* value
) override
{
742 if (PyValue::IsNull(this->options_
, value
)) {
743 return this->list_builder_
->AppendNull();
746 RETURN_NOT_OK(this->list_builder_
->Append());
747 if (PyArray_Check(value
)) {
748 RETURN_NOT_OK(AppendNdarray(value
));
749 } else if (PySequence_Check(value
)) {
750 RETURN_NOT_OK(AppendSequence(value
));
751 } else if (PySet_Check(value
) || (Py_TYPE(value
) == &PyDictValues_Type
)) {
752 RETURN_NOT_OK(AppendIterable(value
));
754 return internal::InvalidType(
755 value
, "was not a sequence or recognized null for conversion to list type");
758 return ValidateBuilder(this->list_type_
);
762 Status
ValidateBuilder(const MapType
*) {
763 if (this->list_builder_
->key_builder()->null_count() > 0) {
764 return Status::Invalid("Invalid Map: key field can not contain null values");
770 Status
ValidateBuilder(const BaseListType
*) { return Status::OK(); }
772 Status
AppendSequence(PyObject
* value
) {
773 int64_t size
= static_cast<int64_t>(PySequence_Size(value
));
774 RETURN_NOT_OK(this->list_builder_
->ValidateOverflow(size
));
775 return this->value_converter_
->Extend(value
, size
);
778 Status
AppendIterable(PyObject
* value
) {
779 PyObject
* iterator
= PyObject_GetIter(value
);
780 OwnedRef
iter_ref(iterator
);
781 while (PyObject
* item
= PyIter_Next(iterator
)) {
782 OwnedRef
item_ref(item
);
783 RETURN_NOT_OK(this->value_converter_
->Reserve(1));
784 RETURN_NOT_OK(this->value_converter_
->Append(item
));
789 Status
AppendNdarray(PyObject
* value
) {
790 PyArrayObject
* ndarray
= reinterpret_cast<PyArrayObject
*>(value
);
791 if (PyArray_NDIM(ndarray
) != 1) {
792 return Status::Invalid("Can only convert 1-dimensional array values");
794 const int64_t size
= PyArray_SIZE(ndarray
);
795 RETURN_NOT_OK(this->list_builder_
->ValidateOverflow(size
));
797 const auto value_type
= this->value_converter_
->builder()->type();
798 switch (value_type
->id()) {
799 // If the value type does not match the expected NumPy dtype, then fall through
800 // to a slower PySequence-based path
801 #define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \
802 case Type::TYPE_ID: { \
803 if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \
804 return this->value_converter_->Extend(value, size); \
806 return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
808 LIST_FAST_CASE(BOOL
, BooleanType
, NPY_BOOL
)
809 LIST_FAST_CASE(UINT8
, UInt8Type
, NPY_UINT8
)
810 LIST_FAST_CASE(INT8
, Int8Type
, NPY_INT8
)
811 LIST_FAST_CASE(UINT16
, UInt16Type
, NPY_UINT16
)
812 LIST_FAST_CASE(INT16
, Int16Type
, NPY_INT16
)
813 LIST_FAST_CASE(UINT32
, UInt32Type
, NPY_UINT32
)
814 LIST_FAST_CASE(INT32
, Int32Type
, NPY_INT32
)
815 LIST_FAST_CASE(UINT64
, UInt64Type
, NPY_UINT64
)
816 LIST_FAST_CASE(INT64
, Int64Type
, NPY_INT64
)
817 LIST_FAST_CASE(HALF_FLOAT
, HalfFloatType
, NPY_FLOAT16
)
818 LIST_FAST_CASE(FLOAT
, FloatType
, NPY_FLOAT
)
819 LIST_FAST_CASE(DOUBLE
, DoubleType
, NPY_DOUBLE
)
820 LIST_FAST_CASE(TIMESTAMP
, TimestampType
, NPY_DATETIME
)
821 LIST_FAST_CASE(DURATION
, DurationType
, NPY_TIMEDELTA
)
822 #undef LIST_FAST_CASE
824 return this->value_converter_
->Extend(value
, size
);
829 template <typename ArrowType
, int NUMPY_TYPE
>
830 Status
AppendNdarrayTyped(PyArrayObject
* ndarray
) {
831 // no need to go through the conversion
832 using NumpyTrait
= internal::npy_traits
<NUMPY_TYPE
>;
833 using NumpyType
= typename
NumpyTrait::value_type
;
834 using ValueBuilderType
= typename TypeTraits
<ArrowType
>::BuilderType
;
836 const bool null_sentinels_possible
=
837 // Always treat Numpy's NaT as null
838 NUMPY_TYPE
== NPY_DATETIME
|| NUMPY_TYPE
== NPY_TIMEDELTA
||
839 // Observing pandas's null sentinels
840 (this->options_
.from_pandas
&& NumpyTrait::supports_nulls
);
843 checked_cast
<ValueBuilderType
*>(this->value_converter_
->builder().get());
845 Ndarray1DIndexer
<NumpyType
> values(ndarray
);
846 if (null_sentinels_possible
) {
847 for (int64_t i
= 0; i
< values
.size(); ++i
) {
848 if (NumpyTrait::isnull(values
[i
])) {
849 RETURN_NOT_OK(value_builder
->AppendNull());
851 RETURN_NOT_OK(value_builder
->Append(values
[i
]));
854 } else if (!values
.is_strided()) {
855 RETURN_NOT_OK(value_builder
->AppendValues(values
.data(), values
.size()));
857 for (int64_t i
= 0; i
< values
.size(); ++i
) {
858 RETURN_NOT_OK(value_builder
->Append(values
[i
]));
865 class PyStructConverter
: public StructConverter
<PyConverter
, PyConverterTrait
> {
867 Status
Append(PyObject
* value
) override
{
868 if (PyValue::IsNull(this->options_
, value
)) {
869 return this->struct_builder_
->AppendNull();
871 switch (input_kind_
) {
872 case InputKind::DICT
:
873 RETURN_NOT_OK(this->struct_builder_
->Append());
874 return AppendDict(value
);
875 case InputKind::TUPLE
:
876 RETURN_NOT_OK(this->struct_builder_
->Append());
877 return AppendTuple(value
);
878 case InputKind::ITEMS
:
879 RETURN_NOT_OK(this->struct_builder_
->Append());
880 return AppendItems(value
);
882 RETURN_NOT_OK(InferInputKind(value
));
883 return Append(value
);
888 Status
Init(MemoryPool
* pool
) override
{
889 RETURN_NOT_OK((StructConverter
<PyConverter
, PyConverterTrait
>::Init(pool
)));
891 // Store the field names as a PyObjects for dict matching
892 num_fields_
= this->struct_type_
->num_fields();
893 bytes_field_names_
.reset(PyList_New(num_fields_
));
894 unicode_field_names_
.reset(PyList_New(num_fields_
));
897 for (int i
= 0; i
< num_fields_
; i
++) {
898 const auto& field_name
= this->struct_type_
->field(i
)->name();
899 PyObject
* bytes
= PyBytes_FromStringAndSize(field_name
.c_str(), field_name
.size());
901 PyUnicode_FromStringAndSize(field_name
.c_str(), field_name
.size());
903 PyList_SET_ITEM(bytes_field_names_
.obj(), i
, bytes
);
904 PyList_SET_ITEM(unicode_field_names_
.obj(), i
, unicode
);
909 Status
InferInputKind(PyObject
* value
) {
910 // Infer input object's type, note that heterogeneous sequences are not allowed
911 if (PyDict_Check(value
)) {
912 input_kind_
= InputKind::DICT
;
913 } else if (PyTuple_Check(value
)) {
914 input_kind_
= InputKind::TUPLE
;
915 } else if (PySequence_Check(value
)) {
916 input_kind_
= InputKind::ITEMS
;
918 return internal::InvalidType(value
,
919 "was not a dict, tuple, or recognized null value "
920 "for conversion to struct type");
925 Status
InferKeyKind(PyObject
* items
) {
926 for (int i
= 0; i
< PySequence_Length(items
); i
++) {
927 // retrieve the key from the passed key-value pairs
928 ARROW_ASSIGN_OR_RAISE(auto pair
, GetKeyValuePair(items
, i
));
930 // check key exists between the unicode field names
931 bool do_contain
= PySequence_Contains(unicode_field_names_
.obj(), pair
.first
);
934 key_kind_
= KeyKind::UNICODE
;
938 // check key exists between the bytes field names
939 do_contain
= PySequence_Contains(bytes_field_names_
.obj(), pair
.first
);
942 key_kind_
= KeyKind::BYTES
;
949 Status
AppendEmpty() {
950 for (int i
= 0; i
< num_fields_
; i
++) {
951 RETURN_NOT_OK(this->children_
[i
]->Append(Py_None
));
956 Status
AppendTuple(PyObject
* tuple
) {
957 if (!PyTuple_Check(tuple
)) {
958 return internal::InvalidType(tuple
, "was expecting a tuple");
960 if (PyTuple_GET_SIZE(tuple
) != num_fields_
) {
961 return Status::Invalid("Tuple size must be equal to number of struct fields");
963 for (int i
= 0; i
< num_fields_
; i
++) {
964 PyObject
* value
= PyTuple_GET_ITEM(tuple
, i
);
965 RETURN_NOT_OK(this->children_
[i
]->Append(value
));
970 Status
AppendDict(PyObject
* dict
) {
971 if (!PyDict_Check(dict
)) {
972 return internal::InvalidType(dict
, "was expecting a dict");
975 case KeyKind::UNICODE
:
976 return AppendDict(dict
, unicode_field_names_
.obj());
978 return AppendDict(dict
, bytes_field_names_
.obj());
980 RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict
)));
981 if (key_kind_
== KeyKind::UNKNOWN
) {
982 // was unable to infer the type which means that all keys are absent
983 return AppendEmpty();
985 return AppendDict(dict
);
990 Status
AppendItems(PyObject
* items
) {
991 if (!PySequence_Check(items
)) {
992 return internal::InvalidType(items
, "was expecting a sequence of key-value items");
995 case KeyKind::UNICODE
:
996 return AppendItems(items
, unicode_field_names_
.obj());
998 return AppendItems(items
, bytes_field_names_
.obj());
1000 RETURN_NOT_OK(InferKeyKind(items
));
1001 if (key_kind_
== KeyKind::UNKNOWN
) {
1002 // was unable to infer the type which means that all keys are absent
1003 return AppendEmpty();
1005 return AppendItems(items
);
1010 Status
AppendDict(PyObject
* dict
, PyObject
* field_names
) {
1011 // NOTE we're ignoring any extraneous dict items
1012 for (int i
= 0; i
< num_fields_
; i
++) {
1013 PyObject
* name
= PyList_GET_ITEM(field_names
, i
); // borrowed
1014 PyObject
* value
= PyDict_GetItem(dict
, name
); // borrowed
1015 if (value
== NULL
) {
1016 RETURN_IF_PYERROR();
1018 RETURN_NOT_OK(this->children_
[i
]->Append(value
? value
: Py_None
));
1020 return Status::OK();
1023 Result
<std::pair
<PyObject
*, PyObject
*>> GetKeyValuePair(PyObject
* seq
, int index
) {
1024 PyObject
* pair
= PySequence_GetItem(seq
, index
);
1025 RETURN_IF_PYERROR();
1026 if (!PyTuple_Check(pair
) || PyTuple_Size(pair
) != 2) {
1027 return internal::InvalidType(pair
, "was expecting tuple of (key, value) pair");
1029 PyObject
* key
= PyTuple_GetItem(pair
, 0);
1030 RETURN_IF_PYERROR();
1031 PyObject
* value
= PyTuple_GetItem(pair
, 1);
1032 RETURN_IF_PYERROR();
1033 return std::make_pair(key
, value
);
1036 Status
AppendItems(PyObject
* items
, PyObject
* field_names
) {
1037 auto length
= static_cast<int>(PySequence_Size(items
));
1038 RETURN_IF_PYERROR();
1040 // append the values for the defined fields
1041 for (int i
= 0; i
< std::min(num_fields_
, length
); i
++) {
1042 // retrieve the key-value pair
1043 ARROW_ASSIGN_OR_RAISE(auto pair
, GetKeyValuePair(items
, i
));
1045 // validate that the key and the field name are equal
1046 PyObject
* name
= PyList_GET_ITEM(field_names
, i
);
1047 bool are_equal
= PyObject_RichCompareBool(pair
.first
, name
, Py_EQ
);
1048 RETURN_IF_PYERROR();
1050 // finally append to the respective child builder
1052 RETURN_NOT_OK(this->children_
[i
]->Append(pair
.second
));
1054 ARROW_ASSIGN_OR_RAISE(auto key_view
, PyBytesView::FromString(pair
.first
));
1055 ARROW_ASSIGN_OR_RAISE(auto name_view
, PyBytesView::FromString(name
));
1056 return Status::Invalid("The expected field name is `", name_view
.bytes
, "` but `",
1057 key_view
.bytes
, "` was given");
1060 // insert null values for missing fields
1061 for (int i
= length
; i
< num_fields_
; i
++) {
1062 RETURN_NOT_OK(this->children_
[i
]->AppendNull());
1064 return Status::OK();
  // Whether we're converting from a sequence of dicts or tuples or list of pairs
  enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN;
  // Whether the input dictionary keys' type is python bytes or unicode
  enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN;
  // Store the field names as a PyObjects for dict matching
  OwnedRef bytes_field_names_;
  OwnedRef unicode_field_names_;
  // Store the number of fields for later reuse
1078 // Convert *obj* to a sequence if necessary
1079 // Fill *size* to its length. If >= 0 on entry, *size* is an upper size
1080 // bound that may lead to truncation.
1081 Status
ConvertToSequenceAndInferSize(PyObject
* obj
, PyObject
** seq
, int64_t* size
) {
1082 if (PySequence_Check(obj
)) {
1083 // obj is already a sequence
1084 int64_t real_size
= static_cast<int64_t>(PySequence_Size(obj
));
1088 *size
= std::min(real_size
, *size
);
1092 } else if (*size
< 0) {
1093 // unknown size, exhaust iterator
1094 *seq
= PySequence_List(obj
);
1095 RETURN_IF_PYERROR();
1096 *size
= static_cast<int64_t>(PyList_GET_SIZE(*seq
));
1098 // size is known but iterator could be infinite
1099 Py_ssize_t i
, n
= *size
;
1100 PyObject
* iter
= PyObject_GetIter(obj
);
1101 RETURN_IF_PYERROR();
1102 OwnedRef
iter_ref(iter
);
1103 PyObject
* lst
= PyList_New(n
);
1104 RETURN_IF_PYERROR();
1105 for (i
= 0; i
< n
; i
++) {
1106 PyObject
* item
= PyIter_Next(iter
);
1108 PyList_SET_ITEM(lst
, i
, item
);
1110 // Shrink list if len(iterator) < size
1111 if (i
< n
&& PyList_SetSlice(lst
, i
, n
, NULL
)) {
1113 return Status::UnknownError("failed to resize list");
1116 *size
= std::min
<int64_t>(i
, *size
);
1118 return Status::OK();
1123 Result
<std::shared_ptr
<ChunkedArray
>> ConvertPySequence(PyObject
* obj
, PyObject
* mask
,
1124 PyConversionOptions options
,
1129 OwnedRef tmp_seq_nanny
;
1131 ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported
, internal::IsModuleImported("pandas"));
1132 if (is_pandas_imported
) {
1133 // If pandas has been already imported initialize the static pandas objects to
1134 // support converting from pd.Timedelta and pd.Timestamp objects
1135 internal::InitPandasStaticData();
1138 int64_t size
= options
.size
;
1139 RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj
, &seq
, &size
));
1140 tmp_seq_nanny
.reset(seq
);
1142 // In some cases, type inference may be "loose", like strings. If the user
1143 // passed pa.string(), then we will error if we encounter any non-UTF8
1144 // value. If not, then we will allow the result to be a BinaryArray
1145 if (options
.type
== nullptr) {
1146 ARROW_ASSIGN_OR_RAISE(options
.type
, InferArrowType(seq
, mask
, options
.from_pandas
));
1147 options
.strict
= false;
1149 options
.strict
= true;
1153 ARROW_ASSIGN_OR_RAISE(auto converter
, (MakeConverter
<PyConverter
, PyConverterTrait
>(
1154 options
.type
, options
, pool
)));
1155 if (converter
->may_overflow()) {
1156 // The converter hierarchy contains binary- or list-like builders which can overflow
1157 // depending on the input values. Wrap the converter with a chunker which detects
1158 // the overflow and automatically creates new chunks.
1159 ARROW_ASSIGN_OR_RAISE(auto chunked_converter
, MakeChunker(std::move(converter
)));
1160 if (mask
!= nullptr && mask
!= Py_None
) {
1161 RETURN_NOT_OK(chunked_converter
->ExtendMasked(seq
, mask
, size
));
1163 RETURN_NOT_OK(chunked_converter
->Extend(seq
, size
));
1165 return chunked_converter
->ToChunkedArray();
1167 // If the converter can't overflow spare the capacity error checking on the hot-path,
1168 // this improves the performance roughly by ~10% for primitive types.
1169 if (mask
!= nullptr && mask
!= Py_None
) {
1170 RETURN_NOT_OK(converter
->ExtendMasked(seq
, mask
, size
));
1172 RETURN_NOT_OK(converter
->Extend(seq
, size
));
1174 return converter
->ToChunkedArray();
1179 } // namespace arrow