]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/python/python_to_arrow.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / python / python_to_arrow.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/python/python_to_arrow.h"
19#include "arrow/python/numpy_interop.h"
20
21#include <datetime.h>
22
23#include <algorithm>
24#include <limits>
25#include <sstream>
26#include <string>
27#include <utility>
28#include <vector>
29
30#include "arrow/array.h"
31#include "arrow/array/builder_binary.h"
32#include "arrow/array/builder_decimal.h"
33#include "arrow/array/builder_dict.h"
34#include "arrow/array/builder_nested.h"
35#include "arrow/array/builder_primitive.h"
36#include "arrow/array/builder_time.h"
37#include "arrow/chunked_array.h"
38#include "arrow/status.h"
39#include "arrow/type.h"
40#include "arrow/type_traits.h"
41#include "arrow/util/checked_cast.h"
42#include "arrow/util/converter.h"
43#include "arrow/util/decimal.h"
44#include "arrow/util/int_util_internal.h"
45#include "arrow/util/logging.h"
46
47#include "arrow/python/datetime.h"
48#include "arrow/python/decimal.h"
49#include "arrow/python/helpers.h"
50#include "arrow/python/inference.h"
51#include "arrow/python/iterators.h"
52#include "arrow/python/numpy_convert.h"
53#include "arrow/python/type_traits.h"
54#include "arrow/visitor_inline.h"
55
56namespace arrow {
57
58using internal::checked_cast;
59using internal::checked_pointer_cast;
60
61using internal::Converter;
62using internal::DictionaryConverter;
63using internal::ListConverter;
64using internal::PrimitiveConverter;
65using internal::StructConverter;
66
67using internal::MakeChunker;
68using internal::MakeConverter;
69
70namespace py {
71
72namespace {
// The components of a month/day/nanosecond interval that are filled in separately
// from the attributes of a Python object.
enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds };

// Describes one interval component: its storage type (c_type) and the ordered,
// sentinel-terminated table of Python attributes that contribute to it (attrs).
template <MonthDayNanoField field>
struct MonthDayNanoTraits;

// A single entry of an attribute table.
struct MonthDayNanoAttrData {
  // Name of the attribute to look up; nullptr in the terminating entry.
  const char* name;
  // Factor relating this attribute's unit to the unit of the preceding entry
  // (1 for the first entry). The terminating entry has a multiplier of 0.
  const int64_t multiplier;
};

// Months component: years and months.
template <>
struct MonthDayNanoTraits<MonthDayNanoField::kMonths> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kMonths>::attrs[] = {
    {"years", 1},
    {"months", /*months_in_year=*/12},
    {nullptr, 0},
};

// Days component when weeks are stored as a separate attribute.
template <>
struct MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays>::attrs[] =
    {
        {"weeks", 1},
        {"days", /*days_in_week=*/7},
        {nullptr, 0},
};

// Days component when only the "days" attribute is consulted.
template <>
struct MonthDayNanoTraits<MonthDayNanoField::kDaysOnly> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kDaysOnly>::attrs[] = {
    {"days", 1},
    {nullptr, 0},
};

// Sub-day component, accumulated from hours down to nanoseconds.
template <>
struct MonthDayNanoTraits<MonthDayNanoField::kNanoseconds> {
  using c_type = int64_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kNanoseconds>::attrs[] =
    {
        {"hours", 1},
        {"minutes", /*minutes_in_hours=*/60},
        {"seconds", /*seconds_in_minute=*/60},
        {"milliseconds", /*milliseconds_in_seconds*/ 1000},
        {"microseconds", /*microseconds_in_millseconds=*/1000},
        {"nanoseconds", /*nanoseconds_in_microseconds=*/1000},
        {nullptr, 0},
};
124
125template <MonthDayNanoField field>
126struct PopulateMonthDayNano {
127 using Traits = MonthDayNanoTraits<field>;
128 using field_c_type = typename Traits::c_type;
129
130 static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) {
131 *out = 0;
132 for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0;
133 ++attr) {
134 if (attr->multiplier != 1 &&
135 ::arrow::internal::MultiplyWithOverflow(
136 static_cast<field_c_type>(attr->multiplier), *out, out)) {
137 return Status::Invalid("Overflow on: ", (attr - 1)->name,
138 " for: ", internal::PyObject_StdStringRepr(obj));
139 }
140
141 OwnedRef field_value(PyObject_GetAttrString(obj, attr->name));
142 if (field_value.obj() == nullptr) {
143 // No attribute present, skip to the next one.
144 PyErr_Clear();
145 continue;
146 }
147 RETURN_IF_PYERROR();
148 *found_attrs = true;
149 field_c_type value;
150 RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name));
151 if (::arrow::internal::AddWithOverflow(*out, value, out)) {
152 return Status::Invalid("Overflow on: ", attr->name,
153 " for: ", internal::PyObject_StdStringRepr(obj));
154 }
155 }
156
157 return Status::OK();
158 }
159};
160
161// Utility for converting single python objects to their intermediate C representations
162// which can be fed to the typed builders
163class PyValue {
164 public:
165 // Type aliases for shorter signature definitions
166 using I = PyObject*;
167 using O = PyConversionOptions;
168
169 // Used for null checking before actually converting the values
170 static bool IsNull(const O& options, I obj) {
171 if (options.from_pandas) {
172 return internal::PandasObjectIsNull(obj);
173 } else {
174 return obj == Py_None;
175 }
176 }
177
178 // Used for post-conversion numpy NaT sentinel checking
179 static bool IsNaT(const TimestampType*, int64_t value) {
180 return internal::npy_traits<NPY_DATETIME>::isnull(value);
181 }
182
183 // Used for post-conversion numpy NaT sentinel checking
184 static bool IsNaT(const DurationType*, int64_t value) {
185 return internal::npy_traits<NPY_TIMEDELTA>::isnull(value);
186 }
187
188 static Result<std::nullptr_t> Convert(const NullType*, const O&, I obj) {
189 if (obj == Py_None) {
190 return nullptr;
191 } else {
192 return Status::Invalid("Invalid null value");
193 }
194 }
195
196 static Result<bool> Convert(const BooleanType*, const O&, I obj) {
197 if (obj == Py_True) {
198 return true;
199 } else if (obj == Py_False) {
200 return false;
201 } else if (PyArray_IsScalar(obj, Bool)) {
202 return reinterpret_cast<PyBoolScalarObject*>(obj)->obval == NPY_TRUE;
203 } else {
204 return internal::InvalidValue(obj, "tried to convert to boolean");
205 }
206 }
207
208 template <typename T>
209 static enable_if_integer<T, Result<typename T::c_type>> Convert(const T* type, const O&,
210 I obj) {
211 typename T::c_type value;
212 auto status = internal::CIntFromPython(obj, &value);
213 if (ARROW_PREDICT_TRUE(status.ok())) {
214 return value;
215 } else if (!internal::PyIntScalar_Check(obj)) {
216 std::stringstream ss;
217 ss << "tried to convert to " << type->ToString();
218 return internal::InvalidValue(obj, ss.str());
219 } else {
220 return status;
221 }
222 }
223
224 static Result<uint16_t> Convert(const HalfFloatType*, const O&, I obj) {
225 uint16_t value;
226 RETURN_NOT_OK(PyFloat_AsHalf(obj, &value));
227 return value;
228 }
229
230 static Result<float> Convert(const FloatType*, const O&, I obj) {
231 float value;
232 if (internal::PyFloatScalar_Check(obj)) {
233 value = static_cast<float>(PyFloat_AsDouble(obj));
234 RETURN_IF_PYERROR();
235 } else if (internal::PyIntScalar_Check(obj)) {
236 RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value));
237 } else {
238 return internal::InvalidValue(obj, "tried to convert to float32");
239 }
240 return value;
241 }
242
243 static Result<double> Convert(const DoubleType*, const O&, I obj) {
244 double value;
245 if (PyFloat_Check(obj)) {
246 value = PyFloat_AS_DOUBLE(obj);
247 } else if (internal::PyFloatScalar_Check(obj)) {
248 // Other kinds of float-y things
249 value = PyFloat_AsDouble(obj);
250 RETURN_IF_PYERROR();
251 } else if (internal::PyIntScalar_Check(obj)) {
252 RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value));
253 } else {
254 return internal::InvalidValue(obj, "tried to convert to double");
255 }
256 return value;
257 }
258
259 static Result<Decimal128> Convert(const Decimal128Type* type, const O&, I obj) {
260 Decimal128 value;
261 RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
262 return value;
263 }
264
265 static Result<Decimal256> Convert(const Decimal256Type* type, const O&, I obj) {
266 Decimal256 value;
267 RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
268 return value;
269 }
270
271 static Result<int32_t> Convert(const Date32Type*, const O&, I obj) {
272 int32_t value;
273 if (PyDate_Check(obj)) {
274 auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
275 value = static_cast<int32_t>(internal::PyDate_to_days(pydate));
276 } else {
277 RETURN_NOT_OK(
278 internal::CIntFromPython(obj, &value, "Integer too large for date32"));
279 }
280 return value;
281 }
282
283 static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
284 int64_t value;
285 if (PyDateTime_Check(obj)) {
286 auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
287 value = internal::PyDateTime_to_ms(pydate);
288 // Truncate any intraday milliseconds
289 // TODO: introduce an option for this
290 value -= value % 86400000LL;
291 } else if (PyDate_Check(obj)) {
292 auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
293 value = internal::PyDate_to_ms(pydate);
294 } else {
295 RETURN_NOT_OK(
296 internal::CIntFromPython(obj, &value, "Integer too large for date64"));
297 }
298 return value;
299 }
300
301 static Result<int32_t> Convert(const Time32Type* type, const O&, I obj) {
302 int32_t value;
303 if (PyTime_Check(obj)) {
304 switch (type->unit()) {
305 case TimeUnit::SECOND:
306 value = static_cast<int32_t>(internal::PyTime_to_s(obj));
307 break;
308 case TimeUnit::MILLI:
309 value = static_cast<int32_t>(internal::PyTime_to_ms(obj));
310 break;
311 default:
312 return Status::UnknownError("Invalid time unit");
313 }
314 } else {
315 RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32"));
316 }
317 return value;
318 }
319
320 static Result<int64_t> Convert(const Time64Type* type, const O&, I obj) {
321 int64_t value;
322 if (PyTime_Check(obj)) {
323 switch (type->unit()) {
324 case TimeUnit::MICRO:
325 value = internal::PyTime_to_us(obj);
326 break;
327 case TimeUnit::NANO:
328 value = internal::PyTime_to_ns(obj);
329 break;
330 default:
331 return Status::UnknownError("Invalid time unit");
332 }
333 } else {
334 RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64"));
335 }
336 return value;
337 }
338
339 static Result<int64_t> Convert(const TimestampType* type, const O& options, I obj) {
340 int64_t value, offset;
341 if (PyDateTime_Check(obj)) {
342 if (ARROW_PREDICT_FALSE(options.ignore_timezone)) {
343 offset = 0;
344 } else {
345 ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj));
346 }
347 auto dt = reinterpret_cast<PyDateTime_DateTime*>(obj);
348 switch (type->unit()) {
349 case TimeUnit::SECOND:
350 value = internal::PyDateTime_to_s(dt) - offset;
351 break;
352 case TimeUnit::MILLI:
353 value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
354 break;
355 case TimeUnit::MICRO:
356 value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
357 break;
358 case TimeUnit::NANO:
359 if (internal::IsPandasTimestamp(obj)) {
360 // pd.Timestamp value attribute contains the offset from unix epoch
361 // so no adjustment for timezone is need.
362 OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
363 RETURN_IF_PYERROR();
364 RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
365 } else {
366 // Conversion to nanoseconds can overflow -> check multiply of microseconds
367 value = internal::PyDateTime_to_us(dt);
368 if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
369 return internal::InvalidValue(obj,
370 "out of bounds for nanosecond resolution");
371 }
372
373 // Adjust with offset and check for overflow
374 if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
375 &value)) {
376 return internal::InvalidValue(obj,
377 "out of bounds for nanosecond resolution");
378 }
379 }
380 break;
381 default:
382 return Status::UnknownError("Invalid time unit");
383 }
384 } else if (PyArray_CheckAnyScalarExact(obj)) {
385 // validate that the numpy scalar has np.datetime64 dtype
386 std::shared_ptr<DataType> numpy_type;
387 RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
388 if (!numpy_type->Equals(*type)) {
389 return Status::NotImplemented("Expected np.datetime64 but got: ",
390 numpy_type->ToString());
391 }
392 return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
393 } else {
394 RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
395 }
396 return value;
397 }
398
399 static Result<MonthDayNanoIntervalType::MonthDayNanos> Convert(
400 const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) {
401 MonthDayNanoIntervalType::MonthDayNanos output;
402 bool found_attrs = false;
403 RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kMonths>::Field(
404 obj, &output.months, &found_attrs));
405 // on relativeoffset weeks is a property calculated from days. On
406 // DateOffset is is a field on its own. timedelta doesn't have a weeks
407 // attribute.
408 PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType();
409 bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj);
410 if (!is_date_offset) {
411 RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kDaysOnly>::Field(
412 obj, &output.days, &found_attrs));
413 } else {
414 RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kWeeksAndDays>::Field(
415 obj, &output.days, &found_attrs));
416 }
417 RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kNanoseconds>::Field(
418 obj, &output.nanoseconds, &found_attrs));
419
420 if (ARROW_PREDICT_FALSE(!found_attrs) && !is_date_offset) {
421 // date_offset can have zero fields.
422 return Status::TypeError("No temporal attributes found on object.");
423 }
424 return output;
425 }
426
427 static Result<int64_t> Convert(const DurationType* type, const O&, I obj) {
428 int64_t value;
429 if (PyDelta_Check(obj)) {
430 auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
431 switch (type->unit()) {
432 case TimeUnit::SECOND:
433 value = internal::PyDelta_to_s(dt);
434 break;
435 case TimeUnit::MILLI:
436 value = internal::PyDelta_to_ms(dt);
437 break;
438 case TimeUnit::MICRO:
439 value = internal::PyDelta_to_us(dt);
440 break;
441 case TimeUnit::NANO:
442 if (internal::IsPandasTimedelta(obj)) {
443 OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
444 RETURN_IF_PYERROR();
445 RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
446 } else {
447 value = internal::PyDelta_to_ns(dt);
448 }
449 break;
450 default:
451 return Status::UnknownError("Invalid time unit");
452 }
453 } else if (PyArray_CheckAnyScalarExact(obj)) {
454 // validate that the numpy scalar has np.datetime64 dtype
455 std::shared_ptr<DataType> numpy_type;
456 RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
457 if (!numpy_type->Equals(*type)) {
458 return Status::NotImplemented("Expected np.timedelta64 but got: ",
459 numpy_type->ToString());
460 }
461 return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
462 } else {
463 RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
464 }
465 return value;
466 }
467
468 // The binary-like intermediate representation is PyBytesView because it keeps temporary
469 // python objects alive (non-contiguous memoryview) and stores whether the original
470 // object was unicode encoded or not, which is used for unicode -> bytes coersion if
471 // there is a non-unicode object observed.
472
473 static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
474 return view.ParseString(obj);
475 }
476
477 static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
478 PyBytesView& view) {
479 ARROW_RETURN_NOT_OK(view.ParseString(obj));
480 if (view.size != type->byte_width()) {
481 std::stringstream ss;
482 ss << "expected to be length " << type->byte_width() << " was " << view.size;
483 return internal::InvalidValue(obj, ss.str());
484 } else {
485 return Status::OK();
486 }
487 }
488
489 template <typename T>
490 static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
491 PyBytesView& view) {
492 if (options.strict) {
493 // Strict conversion, force output to be unicode / utf8 and validate that
494 // any binary values are utf8
495 ARROW_RETURN_NOT_OK(view.ParseString(obj, true));
496 if (!view.is_utf8) {
497 return internal::InvalidValue(obj, "was not a utf8 string");
498 }
499 return Status::OK();
500 } else {
501 // Non-strict conversion; keep track of whether values are unicode or bytes
502 return view.ParseString(obj);
503 }
504 }
505
506 static Result<bool> Convert(const DataType* type, const O&, I obj) {
507 return Status::NotImplemented("PyValue::Convert is not implemented for type ", type);
508 }
509};
510
511// The base Converter class is a mixin with predefined behavior and constructors.
512class PyConverter : public Converter<PyObject*, PyConversionOptions> {
513 public:
514 // Iterate over the input values and defer the conversion to the Append method
515 Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override {
516 DCHECK_GE(size, offset);
517 /// Ensure we've allocated enough space
518 RETURN_NOT_OK(this->Reserve(size - offset));
519 // Iterate over the items adding each one
520 return internal::VisitSequence(
521 values, offset,
522 [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
523 }
524
525 // Convert and append a sequence of values masked with a numpy array
526 Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size,
527 int64_t offset = 0) override {
528 DCHECK_GE(size, offset);
529 /// Ensure we've allocated enough space
530 RETURN_NOT_OK(this->Reserve(size - offset));
531 // Iterate over the items adding each one
532 return internal::VisitSequenceMasked(
533 values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) {
534 if (is_masked) {
535 return this->AppendNull();
536 } else {
537 // This will also apply the null-checking convention in the event
538 // that the value is not masked
539 return this->Append(item); // perhaps use AppendValue instead?
540 }
541 });
542 }
543};
544
545template <typename T, typename Enable = void>
546class PyPrimitiveConverter;
547
548template <typename T>
549class PyListConverter;
550
551template <typename U, typename Enable = void>
552class PyDictionaryConverter;
553
554class PyStructConverter;
555
556template <typename T, typename Enable = void>
557struct PyConverterTrait;
558
559template <typename T>
560struct PyConverterTrait<
561 T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
562 !is_extension_type<T>::value) ||
563 std::is_same<T, MonthDayNanoIntervalType>::value>> {
564 using type = PyPrimitiveConverter<T>;
565};
566
567template <typename T>
568struct PyConverterTrait<T, enable_if_list_like<T>> {
569 using type = PyListConverter<T>;
570};
571
572template <>
573struct PyConverterTrait<StructType> {
574 using type = PyStructConverter;
575};
576
577template <>
578struct PyConverterTrait<DictionaryType> {
579 template <typename T>
580 using dictionary_type = PyDictionaryConverter<T>;
581};
582
583template <typename T>
584class PyPrimitiveConverter<T, enable_if_null<T>>
585 : public PrimitiveConverter<T, PyConverter> {
586 public:
587 Status Append(PyObject* value) override {
588 if (PyValue::IsNull(this->options_, value)) {
589 return this->primitive_builder_->AppendNull();
590 } else {
591 ARROW_ASSIGN_OR_RAISE(
592 auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
593 return this->primitive_builder_->Append(converted);
594 }
595 }
596};
597
598template <typename T>
599class PyPrimitiveConverter<
600 T, enable_if_t<is_boolean_type<T>::value || is_number_type<T>::value ||
601 is_decimal_type<T>::value || is_date_type<T>::value ||
602 is_time_type<T>::value ||
603 std::is_same<MonthDayNanoIntervalType, T>::value>>
604 : public PrimitiveConverter<T, PyConverter> {
605 public:
606 Status Append(PyObject* value) override {
607 // Since the required space has been already allocated in the Extend functions we can
608 // rely on the Unsafe builder API which improves the performance.
609 if (PyValue::IsNull(this->options_, value)) {
610 this->primitive_builder_->UnsafeAppendNull();
611 } else {
612 ARROW_ASSIGN_OR_RAISE(
613 auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
614 this->primitive_builder_->UnsafeAppend(converted);
615 }
616 return Status::OK();
617 }
618};
619
620template <typename T>
621class PyPrimitiveConverter<
622 T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>>
623 : public PrimitiveConverter<T, PyConverter> {
624 public:
625 Status Append(PyObject* value) override {
626 if (PyValue::IsNull(this->options_, value)) {
627 this->primitive_builder_->UnsafeAppendNull();
628 } else {
629 ARROW_ASSIGN_OR_RAISE(
630 auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
631 // Numpy NaT sentinels can be checked after the conversion
632 if (PyArray_CheckAnyScalarExact(value) &&
633 PyValue::IsNaT(this->primitive_type_, converted)) {
634 this->primitive_builder_->UnsafeAppendNull();
635 } else {
636 this->primitive_builder_->UnsafeAppend(converted);
637 }
638 }
639 return Status::OK();
640 }
641};
642
643template <typename T>
644class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::value>>
645 : public PrimitiveConverter<T, PyConverter> {
646 public:
647 Status Append(PyObject* value) override {
648 if (PyValue::IsNull(this->options_, value)) {
649 this->primitive_builder_->UnsafeAppendNull();
650 } else {
651 ARROW_RETURN_NOT_OK(
652 PyValue::Convert(this->primitive_type_, this->options_, value, view_));
653 ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
654 this->primitive_builder_->UnsafeAppend(view_.bytes);
655 }
656 return Status::OK();
657 }
658
659 protected:
660 PyBytesView view_;
661};
662
663template <typename T>
664class PyPrimitiveConverter<T, enable_if_base_binary<T>>
665 : public PrimitiveConverter<T, PyConverter> {
666 public:
667 using OffsetType = typename T::offset_type;
668
669 Status Append(PyObject* value) override {
670 if (PyValue::IsNull(this->options_, value)) {
671 this->primitive_builder_->UnsafeAppendNull();
672 } else {
673 ARROW_RETURN_NOT_OK(
674 PyValue::Convert(this->primitive_type_, this->options_, value, view_));
675 if (!view_.is_utf8) {
676 // observed binary value
677 observed_binary_ = true;
678 }
679 // Since we don't know the varying length input size in advance, we need to
680 // reserve space in the value builder one by one. ReserveData raises CapacityError
681 // if the value would not fit into the array.
682 ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
683 this->primitive_builder_->UnsafeAppend(view_.bytes,
684 static_cast<OffsetType>(view_.size));
685 }
686 return Status::OK();
687 }
688
689 Result<std::shared_ptr<Array>> ToArray() override {
690 ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
691 if (observed_binary_) {
692 // if we saw any non-unicode, cast results to BinaryArray
693 auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
694 return array->View(binary_type);
695 } else {
696 return array;
697 }
698 }
699
700 protected:
701 PyBytesView view_;
702 bool observed_binary_ = false;
703};
704
705template <typename U>
706class PyDictionaryConverter<U, enable_if_has_c_type<U>>
707 : public DictionaryConverter<U, PyConverter> {
708 public:
709 Status Append(PyObject* value) override {
710 if (PyValue::IsNull(this->options_, value)) {
711 return this->value_builder_->AppendNull();
712 } else {
713 ARROW_ASSIGN_OR_RAISE(auto converted,
714 PyValue::Convert(this->value_type_, this->options_, value));
715 return this->value_builder_->Append(converted);
716 }
717 }
718};
719
720template <typename U>
721class PyDictionaryConverter<U, enable_if_has_string_view<U>>
722 : public DictionaryConverter<U, PyConverter> {
723 public:
724 Status Append(PyObject* value) override {
725 if (PyValue::IsNull(this->options_, value)) {
726 return this->value_builder_->AppendNull();
727 } else {
728 ARROW_RETURN_NOT_OK(
729 PyValue::Convert(this->value_type_, this->options_, value, view_));
730 return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
731 }
732 }
733
734 protected:
735 PyBytesView view_;
736};
737
738template <typename T>
739class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
740 public:
741 Status Append(PyObject* value) override {
742 if (PyValue::IsNull(this->options_, value)) {
743 return this->list_builder_->AppendNull();
744 }
745
746 RETURN_NOT_OK(this->list_builder_->Append());
747 if (PyArray_Check(value)) {
748 RETURN_NOT_OK(AppendNdarray(value));
749 } else if (PySequence_Check(value)) {
750 RETURN_NOT_OK(AppendSequence(value));
751 } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
752 RETURN_NOT_OK(AppendIterable(value));
753 } else {
754 return internal::InvalidType(
755 value, "was not a sequence or recognized null for conversion to list type");
756 }
757
758 return ValidateBuilder(this->list_type_);
759 }
760
761 protected:
762 Status ValidateBuilder(const MapType*) {
763 if (this->list_builder_->key_builder()->null_count() > 0) {
764 return Status::Invalid("Invalid Map: key field can not contain null values");
765 } else {
766 return Status::OK();
767 }
768 }
769
770 Status ValidateBuilder(const BaseListType*) { return Status::OK(); }
771
772 Status AppendSequence(PyObject* value) {
773 int64_t size = static_cast<int64_t>(PySequence_Size(value));
774 RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
775 return this->value_converter_->Extend(value, size);
776 }
777
778 Status AppendIterable(PyObject* value) {
779 PyObject* iterator = PyObject_GetIter(value);
780 OwnedRef iter_ref(iterator);
781 while (PyObject* item = PyIter_Next(iterator)) {
782 OwnedRef item_ref(item);
783 RETURN_NOT_OK(this->value_converter_->Reserve(1));
784 RETURN_NOT_OK(this->value_converter_->Append(item));
785 }
786 return Status::OK();
787 }
788
789 Status AppendNdarray(PyObject* value) {
790 PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
791 if (PyArray_NDIM(ndarray) != 1) {
792 return Status::Invalid("Can only convert 1-dimensional array values");
793 }
794 const int64_t size = PyArray_SIZE(ndarray);
795 RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
796
797 const auto value_type = this->value_converter_->builder()->type();
798 switch (value_type->id()) {
799// If the value type does not match the expected NumPy dtype, then fall through
800// to a slower PySequence-based path
801#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \
802 case Type::TYPE_ID: { \
803 if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \
804 return this->value_converter_->Extend(value, size); \
805 } \
806 return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
807 }
808 LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL)
809 LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8)
810 LIST_FAST_CASE(INT8, Int8Type, NPY_INT8)
811 LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16)
812 LIST_FAST_CASE(INT16, Int16Type, NPY_INT16)
813 LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32)
814 LIST_FAST_CASE(INT32, Int32Type, NPY_INT32)
815 LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64)
816 LIST_FAST_CASE(INT64, Int64Type, NPY_INT64)
817 LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16)
818 LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT)
819 LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE)
820 LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME)
821 LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA)
822#undef LIST_FAST_CASE
823 default: {
824 return this->value_converter_->Extend(value, size);
825 }
826 }
827 }
828
829 template <typename ArrowType, int NUMPY_TYPE>
830 Status AppendNdarrayTyped(PyArrayObject* ndarray) {
831 // no need to go through the conversion
832 using NumpyTrait = internal::npy_traits<NUMPY_TYPE>;
833 using NumpyType = typename NumpyTrait::value_type;
834 using ValueBuilderType = typename TypeTraits<ArrowType>::BuilderType;
835
836 const bool null_sentinels_possible =
837 // Always treat Numpy's NaT as null
838 NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA ||
839 // Observing pandas's null sentinels
840 (this->options_.from_pandas && NumpyTrait::supports_nulls);
841
842 auto value_builder =
843 checked_cast<ValueBuilderType*>(this->value_converter_->builder().get());
844
845 Ndarray1DIndexer<NumpyType> values(ndarray);
846 if (null_sentinels_possible) {
847 for (int64_t i = 0; i < values.size(); ++i) {
848 if (NumpyTrait::isnull(values[i])) {
849 RETURN_NOT_OK(value_builder->AppendNull());
850 } else {
851 RETURN_NOT_OK(value_builder->Append(values[i]));
852 }
853 }
854 } else if (!values.is_strided()) {
855 RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size()));
856 } else {
857 for (int64_t i = 0; i < values.size(); ++i) {
858 RETURN_NOT_OK(value_builder->Append(values[i]));
859 }
860 }
861 return Status::OK();
862 }
863};
864
865class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait> {
866 public:
867 Status Append(PyObject* value) override {
868 if (PyValue::IsNull(this->options_, value)) {
869 return this->struct_builder_->AppendNull();
870 }
871 switch (input_kind_) {
872 case InputKind::DICT:
873 RETURN_NOT_OK(this->struct_builder_->Append());
874 return AppendDict(value);
875 case InputKind::TUPLE:
876 RETURN_NOT_OK(this->struct_builder_->Append());
877 return AppendTuple(value);
878 case InputKind::ITEMS:
879 RETURN_NOT_OK(this->struct_builder_->Append());
880 return AppendItems(value);
881 default:
882 RETURN_NOT_OK(InferInputKind(value));
883 return Append(value);
884 }
885 }
886
887 protected:
888 Status Init(MemoryPool* pool) override {
889 RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));
890
891 // Store the field names as a PyObjects for dict matching
892 num_fields_ = this->struct_type_->num_fields();
893 bytes_field_names_.reset(PyList_New(num_fields_));
894 unicode_field_names_.reset(PyList_New(num_fields_));
895 RETURN_IF_PYERROR();
896
897 for (int i = 0; i < num_fields_; i++) {
898 const auto& field_name = this->struct_type_->field(i)->name();
899 PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
900 PyObject* unicode =
901 PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
902 RETURN_IF_PYERROR();
903 PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes);
904 PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode);
905 }
906 return Status::OK();
907 }
908
909 Status InferInputKind(PyObject* value) {
910 // Infer input object's type, note that heterogeneous sequences are not allowed
911 if (PyDict_Check(value)) {
912 input_kind_ = InputKind::DICT;
913 } else if (PyTuple_Check(value)) {
914 input_kind_ = InputKind::TUPLE;
915 } else if (PySequence_Check(value)) {
916 input_kind_ = InputKind::ITEMS;
917 } else {
918 return internal::InvalidType(value,
919 "was not a dict, tuple, or recognized null value "
920 "for conversion to struct type");
921 }
922 return Status::OK();
923 }
924
925 Status InferKeyKind(PyObject* items) {
926 for (int i = 0; i < PySequence_Length(items); i++) {
927 // retrieve the key from the passed key-value pairs
928 ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
929
930 // check key exists between the unicode field names
931 bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first);
932 RETURN_IF_PYERROR();
933 if (do_contain) {
934 key_kind_ = KeyKind::UNICODE;
935 return Status::OK();
936 }
937
938 // check key exists between the bytes field names
939 do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first);
940 RETURN_IF_PYERROR();
941 if (do_contain) {
942 key_kind_ = KeyKind::BYTES;
943 return Status::OK();
944 }
945 }
946 return Status::OK();
947 }
948
949 Status AppendEmpty() {
950 for (int i = 0; i < num_fields_; i++) {
951 RETURN_NOT_OK(this->children_[i]->Append(Py_None));
952 }
953 return Status::OK();
954 }
955
956 Status AppendTuple(PyObject* tuple) {
957 if (!PyTuple_Check(tuple)) {
958 return internal::InvalidType(tuple, "was expecting a tuple");
959 }
960 if (PyTuple_GET_SIZE(tuple) != num_fields_) {
961 return Status::Invalid("Tuple size must be equal to number of struct fields");
962 }
963 for (int i = 0; i < num_fields_; i++) {
964 PyObject* value = PyTuple_GET_ITEM(tuple, i);
965 RETURN_NOT_OK(this->children_[i]->Append(value));
966 }
967 return Status::OK();
968 }
969
970 Status AppendDict(PyObject* dict) {
971 if (!PyDict_Check(dict)) {
972 return internal::InvalidType(dict, "was expecting a dict");
973 }
974 switch (key_kind_) {
975 case KeyKind::UNICODE:
976 return AppendDict(dict, unicode_field_names_.obj());
977 case KeyKind::BYTES:
978 return AppendDict(dict, bytes_field_names_.obj());
979 default:
980 RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict)));
981 if (key_kind_ == KeyKind::UNKNOWN) {
982 // was unable to infer the type which means that all keys are absent
983 return AppendEmpty();
984 } else {
985 return AppendDict(dict);
986 }
987 }
988 }
989
990 Status AppendItems(PyObject* items) {
991 if (!PySequence_Check(items)) {
992 return internal::InvalidType(items, "was expecting a sequence of key-value items");
993 }
994 switch (key_kind_) {
995 case KeyKind::UNICODE:
996 return AppendItems(items, unicode_field_names_.obj());
997 case KeyKind::BYTES:
998 return AppendItems(items, bytes_field_names_.obj());
999 default:
1000 RETURN_NOT_OK(InferKeyKind(items));
1001 if (key_kind_ == KeyKind::UNKNOWN) {
1002 // was unable to infer the type which means that all keys are absent
1003 return AppendEmpty();
1004 } else {
1005 return AppendItems(items);
1006 }
1007 }
1008 }
1009
1010 Status AppendDict(PyObject* dict, PyObject* field_names) {
1011 // NOTE we're ignoring any extraneous dict items
1012 for (int i = 0; i < num_fields_; i++) {
1013 PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed
1014 PyObject* value = PyDict_GetItem(dict, name); // borrowed
1015 if (value == NULL) {
1016 RETURN_IF_PYERROR();
1017 }
1018 RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None));
1019 }
1020 return Status::OK();
1021 }
1022
1023 Result<std::pair<PyObject*, PyObject*>> GetKeyValuePair(PyObject* seq, int index) {
1024 PyObject* pair = PySequence_GetItem(seq, index);
1025 RETURN_IF_PYERROR();
1026 if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) {
1027 return internal::InvalidType(pair, "was expecting tuple of (key, value) pair");
1028 }
1029 PyObject* key = PyTuple_GetItem(pair, 0);
1030 RETURN_IF_PYERROR();
1031 PyObject* value = PyTuple_GetItem(pair, 1);
1032 RETURN_IF_PYERROR();
1033 return std::make_pair(key, value);
1034 }
1035
1036 Status AppendItems(PyObject* items, PyObject* field_names) {
1037 auto length = static_cast<int>(PySequence_Size(items));
1038 RETURN_IF_PYERROR();
1039
1040 // append the values for the defined fields
1041 for (int i = 0; i < std::min(num_fields_, length); i++) {
1042 // retrieve the key-value pair
1043 ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
1044
1045 // validate that the key and the field name are equal
1046 PyObject* name = PyList_GET_ITEM(field_names, i);
1047 bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ);
1048 RETURN_IF_PYERROR();
1049
1050 // finally append to the respective child builder
1051 if (are_equal) {
1052 RETURN_NOT_OK(this->children_[i]->Append(pair.second));
1053 } else {
1054 ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first));
1055 ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name));
1056 return Status::Invalid("The expected field name is `", name_view.bytes, "` but `",
1057 key_view.bytes, "` was given");
1058 }
1059 }
1060 // insert null values for missing fields
1061 for (int i = length; i < num_fields_; i++) {
1062 RETURN_NOT_OK(this->children_[i]->AppendNull());
1063 }
1064 return Status::OK();
1065 }
1066
1067 // Whether we're converting from a sequence of dicts or tuples or list of pairs
1068 enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN;
1069 // Whether the input dictionary keys' type is python bytes or unicode
1070 enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN;
1071 // Store the field names as a PyObjects for dict matching
1072 OwnedRef bytes_field_names_;
1073 OwnedRef unicode_field_names_;
1074 // Store the number of fields for later reuse
1075 int num_fields_;
1076};
1077
1078// Convert *obj* to a sequence if necessary
1079// Fill *size* to its length. If >= 0 on entry, *size* is an upper size
1080// bound that may lead to truncation.
1081Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) {
1082 if (PySequence_Check(obj)) {
1083 // obj is already a sequence
1084 int64_t real_size = static_cast<int64_t>(PySequence_Size(obj));
1085 if (*size < 0) {
1086 *size = real_size;
1087 } else {
1088 *size = std::min(real_size, *size);
1089 }
1090 Py_INCREF(obj);
1091 *seq = obj;
1092 } else if (*size < 0) {
1093 // unknown size, exhaust iterator
1094 *seq = PySequence_List(obj);
1095 RETURN_IF_PYERROR();
1096 *size = static_cast<int64_t>(PyList_GET_SIZE(*seq));
1097 } else {
1098 // size is known but iterator could be infinite
1099 Py_ssize_t i, n = *size;
1100 PyObject* iter = PyObject_GetIter(obj);
1101 RETURN_IF_PYERROR();
1102 OwnedRef iter_ref(iter);
1103 PyObject* lst = PyList_New(n);
1104 RETURN_IF_PYERROR();
1105 for (i = 0; i < n; i++) {
1106 PyObject* item = PyIter_Next(iter);
1107 if (!item) break;
1108 PyList_SET_ITEM(lst, i, item);
1109 }
1110 // Shrink list if len(iterator) < size
1111 if (i < n && PyList_SetSlice(lst, i, n, NULL)) {
1112 Py_DECREF(lst);
1113 return Status::UnknownError("failed to resize list");
1114 }
1115 *seq = lst;
1116 *size = std::min<int64_t>(i, *size);
1117 }
1118 return Status::OK();
1119}
1120
1121} // namespace
1122
1123Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* mask,
1124 PyConversionOptions options,
1125 MemoryPool* pool) {
1126 PyAcquireGIL lock;
1127
1128 PyObject* seq;
1129 OwnedRef tmp_seq_nanny;
1130
1131 ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas"));
1132 if (is_pandas_imported) {
1133 // If pandas has been already imported initialize the static pandas objects to
1134 // support converting from pd.Timedelta and pd.Timestamp objects
1135 internal::InitPandasStaticData();
1136 }
1137
1138 int64_t size = options.size;
1139 RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
1140 tmp_seq_nanny.reset(seq);
1141
1142 // In some cases, type inference may be "loose", like strings. If the user
1143 // passed pa.string(), then we will error if we encounter any non-UTF8
1144 // value. If not, then we will allow the result to be a BinaryArray
1145 if (options.type == nullptr) {
1146 ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
1147 options.strict = false;
1148 } else {
1149 options.strict = true;
1150 }
1151 DCHECK_GE(size, 0);
1152
1153 ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
1154 options.type, options, pool)));
1155 if (converter->may_overflow()) {
1156 // The converter hierarchy contains binary- or list-like builders which can overflow
1157 // depending on the input values. Wrap the converter with a chunker which detects
1158 // the overflow and automatically creates new chunks.
1159 ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter)));
1160 if (mask != nullptr && mask != Py_None) {
1161 RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size));
1162 } else {
1163 RETURN_NOT_OK(chunked_converter->Extend(seq, size));
1164 }
1165 return chunked_converter->ToChunkedArray();
1166 } else {
1167 // If the converter can't overflow spare the capacity error checking on the hot-path,
1168 // this improves the performance roughly by ~10% for primitive types.
1169 if (mask != nullptr && mask != Py_None) {
1170 RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size));
1171 } else {
1172 RETURN_NOT_OK(converter->Extend(seq, size));
1173 }
1174 return converter->ToChunkedArray();
1175 }
1176}
1177
1178} // namespace py
1179} // namespace arrow