]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "arrow/python/python_to_arrow.h" | |
19 | #include "arrow/python/numpy_interop.h" | |
20 | ||
21 | #include <datetime.h> | |
22 | ||
23 | #include <algorithm> | |
24 | #include <limits> | |
25 | #include <sstream> | |
26 | #include <string> | |
27 | #include <utility> | |
28 | #include <vector> | |
29 | ||
30 | #include "arrow/array.h" | |
31 | #include "arrow/array/builder_binary.h" | |
32 | #include "arrow/array/builder_decimal.h" | |
33 | #include "arrow/array/builder_dict.h" | |
34 | #include "arrow/array/builder_nested.h" | |
35 | #include "arrow/array/builder_primitive.h" | |
36 | #include "arrow/array/builder_time.h" | |
37 | #include "arrow/chunked_array.h" | |
38 | #include "arrow/status.h" | |
39 | #include "arrow/type.h" | |
40 | #include "arrow/type_traits.h" | |
41 | #include "arrow/util/checked_cast.h" | |
42 | #include "arrow/util/converter.h" | |
43 | #include "arrow/util/decimal.h" | |
44 | #include "arrow/util/int_util_internal.h" | |
45 | #include "arrow/util/logging.h" | |
46 | ||
47 | #include "arrow/python/datetime.h" | |
48 | #include "arrow/python/decimal.h" | |
49 | #include "arrow/python/helpers.h" | |
50 | #include "arrow/python/inference.h" | |
51 | #include "arrow/python/iterators.h" | |
52 | #include "arrow/python/numpy_convert.h" | |
53 | #include "arrow/python/type_traits.h" | |
54 | #include "arrow/visitor_inline.h" | |
55 | ||
56 | namespace arrow { | |
57 | ||
58 | using internal::checked_cast; | |
59 | using internal::checked_pointer_cast; | |
60 | ||
61 | using internal::Converter; | |
62 | using internal::DictionaryConverter; | |
63 | using internal::ListConverter; | |
64 | using internal::PrimitiveConverter; | |
65 | using internal::StructConverter; | |
66 | ||
67 | using internal::MakeChunker; | |
68 | using internal::MakeConverter; | |
69 | ||
70 | namespace py { | |
71 | ||
72 | namespace { | |
// Identifies which component of a MonthDayNanoInterval a set of Python
// attributes contributes to.  kWeeksAndDays vs kDaysOnly exists because
// pandas' DateOffset exposes "weeks" as its own field, whereas
// dateutil.relativedelta derives weeks from days and datetime.timedelta
// has no weeks attribute at all.
enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds };

template <MonthDayNanoField field>
struct MonthDayNanoTraits;

// One (attribute name, multiplier) entry of a conversion table.  The
// multiplier rescales the running total accumulated from all *previous*
// (coarser) entries into this entry's unit.  A null name terminates the
// table.
struct MonthDayNanoAttrData {
  const char* name;
  const int64_t multiplier;
};

template <>
struct MonthDayNanoTraits<MonthDayNanoField::kMonths> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kMonths>::attrs[] = {
    {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}};

template <>
struct MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays>::attrs[] =
    {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}};

template <>
struct MonthDayNanoTraits<MonthDayNanoField::kDaysOnly> {
  using c_type = int32_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kDaysOnly>::attrs[] = {
    {"days", 1}, {nullptr, 0}};

template <>
struct MonthDayNanoTraits<MonthDayNanoField::kNanoseconds> {
  using c_type = int64_t;
  static const MonthDayNanoAttrData attrs[];
};

const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kNanoseconds>::attrs[] =
    {{"hours", 1},
     {"minutes", /*minutes_in_hour=*/60},
     {"seconds", /*seconds_in_minute=*/60},
     {"milliseconds", /*milliseconds_in_second=*/1000},
     {"microseconds", /*microseconds_in_millisecond=*/1000},
     {"nanoseconds", /*nanoseconds_in_microsecond=*/1000},
     {nullptr, 0}};
124 | ||
// Accumulates one component (months, days or nanoseconds) of a
// MonthDayNanoInterval from the Python attributes listed in
// MonthDayNanoTraits<field>::attrs.
template <MonthDayNanoField field>
struct PopulateMonthDayNano {
  using Traits = MonthDayNanoTraits<field>;
  using field_c_type = typename Traits::c_type;

  // Sums the relevant attributes of `obj` into `*out`.  The attrs table
  // runs from coarse to fine units; before each attribute is added, the
  // total accumulated so far is rescaled into that attribute's unit
  // (e.g. years -> months), with overflow checking on both the multiply
  // and the add.  Attributes missing on `obj` are skipped.  `*found_attrs`
  // is set to true if at least one attribute was present; it is never
  // reset to false here, so it can accumulate across multiple calls.
  static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) {
    *out = 0;
    for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0;
         ++attr) {
      // multiplier == 1 marks the first (coarsest) entry: nothing has been
      // accumulated yet, so there is nothing to rescale.  The error names
      // (attr - 1) because the overflow happened scaling the previous unit.
      if (attr->multiplier != 1 &&
          ::arrow::internal::MultiplyWithOverflow(
              static_cast<field_c_type>(attr->multiplier), *out, out)) {
        return Status::Invalid("Overflow on: ", (attr - 1)->name,
                               " for: ", internal::PyObject_StdStringRepr(obj));
      }

      OwnedRef field_value(PyObject_GetAttrString(obj, attr->name));
      if (field_value.obj() == nullptr) {
        // No attribute present, skip to the next one.
        PyErr_Clear();
        continue;
      }
      RETURN_IF_PYERROR();
      *found_attrs = true;
      field_c_type value;
      RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name));
      if (::arrow::internal::AddWithOverflow(*out, value, out)) {
        return Status::Invalid("Overflow on: ", attr->name,
                               " for: ", internal::PyObject_StdStringRepr(obj));
      }
    }

    return Status::OK();
  }
};
160 | ||
// Utility for converting single python objects to their intermediate C
// representations which can be fed to the typed builders
class PyValue {
 public:
  // Type aliases for shorter signature definitions
  using I = PyObject*;
  using O = PyConversionOptions;

  // Used for null checking before actually converting the values.  With
  // from_pandas enabled, pandas-style sentinels (None, NaN, NaT, pd.NA,
  // ...) all count as null; otherwise only None does.
  static bool IsNull(const O& options, I obj) {
    if (options.from_pandas) {
      return internal::PandasObjectIsNull(obj);
    } else {
      return obj == Py_None;
    }
  }

  // Used for post-conversion numpy NaT sentinel checking
  static bool IsNaT(const TimestampType*, int64_t value) {
    return internal::npy_traits<NPY_DATETIME>::isnull(value);
  }

  // Used for post-conversion numpy NaT sentinel checking
  static bool IsNaT(const DurationType*, int64_t value) {
    return internal::npy_traits<NPY_TIMEDELTA>::isnull(value);
  }

  static Result<std::nullptr_t> Convert(const NullType*, const O&, I obj) {
    if (obj == Py_None) {
      return nullptr;
    } else {
      return Status::Invalid("Invalid null value");
    }
  }

  static Result<bool> Convert(const BooleanType*, const O&, I obj) {
    if (obj == Py_True) {
      return true;
    } else if (obj == Py_False) {
      return false;
    } else if (PyArray_IsScalar(obj, Bool)) {
      // numpy np.bool_ scalar
      return reinterpret_cast<PyBoolScalarObject*>(obj)->obval == NPY_TRUE;
    } else {
      return internal::InvalidValue(obj, "tried to convert to boolean");
    }
  }

  template <typename T>
  static enable_if_integer<T, Result<typename T::c_type>> Convert(const T* type, const O&,
                                                                  I obj) {
    typename T::c_type value;
    auto status = internal::CIntFromPython(obj, &value);
    if (ARROW_PREDICT_TRUE(status.ok())) {
      return value;
    } else if (!internal::PyIntScalar_Check(obj)) {
      // Not integer-like at all: report a type error instead of the
      // (range-related) error coming back from CIntFromPython.
      std::stringstream ss;
      ss << "tried to convert to " << type->ToString();
      return internal::InvalidValue(obj, ss.str());
    } else {
      // Integer-like but out of range: propagate the original error.
      return status;
    }
  }

  static Result<uint16_t> Convert(const HalfFloatType*, const O&, I obj) {
    uint16_t value;
    RETURN_NOT_OK(PyFloat_AsHalf(obj, &value));
    return value;
  }

  static Result<float> Convert(const FloatType*, const O&, I obj) {
    float value;
    if (internal::PyFloatScalar_Check(obj)) {
      value = static_cast<float>(PyFloat_AsDouble(obj));
      RETURN_IF_PYERROR();
    } else if (internal::PyIntScalar_Check(obj)) {
      // Reject integers that cannot be represented exactly as float32.
      RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value));
    } else {
      return internal::InvalidValue(obj, "tried to convert to float32");
    }
    return value;
  }

  static Result<double> Convert(const DoubleType*, const O&, I obj) {
    double value;
    if (PyFloat_Check(obj)) {
      // Exact python float: use the unchecked fast path.
      value = PyFloat_AS_DOUBLE(obj);
    } else if (internal::PyFloatScalar_Check(obj)) {
      // Other kinds of float-y things
      value = PyFloat_AsDouble(obj);
      RETURN_IF_PYERROR();
    } else if (internal::PyIntScalar_Check(obj)) {
      // Reject integers that cannot be represented exactly as float64.
      RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value));
    } else {
      return internal::InvalidValue(obj, "tried to convert to double");
    }
    return value;
  }

  static Result<Decimal128> Convert(const Decimal128Type* type, const O&, I obj) {
    Decimal128 value;
    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
    return value;
  }

  static Result<Decimal256> Convert(const Decimal256Type* type, const O&, I obj) {
    Decimal256 value;
    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
    return value;
  }

  static Result<int32_t> Convert(const Date32Type*, const O&, I obj) {
    int32_t value;
    if (PyDate_Check(obj)) {
      auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
      value = static_cast<int32_t>(internal::PyDate_to_days(pydate));
    } else {
      // Fall back to interpreting the object as a raw day count.
      RETURN_NOT_OK(
          internal::CIntFromPython(obj, &value, "Integer too large for date32"));
    }
    return value;
  }

  static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
    int64_t value;
    if (PyDateTime_Check(obj)) {
      auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
      value = internal::PyDateTime_to_ms(pydate);
      // Truncate any intraday milliseconds
      // TODO: introduce an option for this
      value -= value % 86400000LL;
    } else if (PyDate_Check(obj)) {
      auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
      value = internal::PyDate_to_ms(pydate);
    } else {
      // Fall back to interpreting the object as raw milliseconds.
      RETURN_NOT_OK(
          internal::CIntFromPython(obj, &value, "Integer too large for date64"));
    }
    return value;
  }

  static Result<int32_t> Convert(const Time32Type* type, const O&, I obj) {
    int32_t value;
    if (PyTime_Check(obj)) {
      switch (type->unit()) {
        case TimeUnit::SECOND:
          value = static_cast<int32_t>(internal::PyTime_to_s(obj));
          break;
        case TimeUnit::MILLI:
          value = static_cast<int32_t>(internal::PyTime_to_ms(obj));
          break;
        default:
          // time32 only supports seconds and milliseconds.
          return Status::UnknownError("Invalid time unit");
      }
    } else {
      RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32"));
    }
    return value;
  }

  static Result<int64_t> Convert(const Time64Type* type, const O&, I obj) {
    int64_t value;
    if (PyTime_Check(obj)) {
      switch (type->unit()) {
        case TimeUnit::MICRO:
          value = internal::PyTime_to_us(obj);
          break;
        case TimeUnit::NANO:
          value = internal::PyTime_to_ns(obj);
          break;
        default:
          // time64 only supports microseconds and nanoseconds.
          return Status::UnknownError("Invalid time unit");
      }
    } else {
      RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64"));
    }
    return value;
  }

  static Result<int64_t> Convert(const TimestampType* type, const O& options, I obj) {
    int64_t value, offset;
    if (PyDateTime_Check(obj)) {
      // The UTC offset of a tz-aware datetime is subtracted so the stored
      // value is relative to the unix epoch (naive datetimes yield 0).
      if (ARROW_PREDICT_FALSE(options.ignore_timezone)) {
        offset = 0;
      } else {
        ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj));
      }
      auto dt = reinterpret_cast<PyDateTime_DateTime*>(obj);
      switch (type->unit()) {
        case TimeUnit::SECOND:
          value = internal::PyDateTime_to_s(dt) - offset;
          break;
        case TimeUnit::MILLI:
          value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
          break;
        case TimeUnit::MICRO:
          value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
          break;
        case TimeUnit::NANO:
          if (internal::IsPandasTimestamp(obj)) {
            // pd.Timestamp value attribute contains the offset from unix epoch
            // so no adjustment for timezone is needed.
            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
            RETURN_IF_PYERROR();
            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
          } else {
            // Conversion to nanoseconds can overflow -> check multiply of microseconds
            value = internal::PyDateTime_to_us(dt);
            if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
              return internal::InvalidValue(obj,
                                            "out of bounds for nanosecond resolution");
            }

            // Adjust with offset and check for overflow
            if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
                                                      &value)) {
              return internal::InvalidValue(obj,
                                            "out of bounds for nanosecond resolution");
            }
          }
          break;
        default:
          return Status::UnknownError("Invalid time unit");
      }
    } else if (PyArray_CheckAnyScalarExact(obj)) {
      // validate that the numpy scalar has np.datetime64 dtype
      std::shared_ptr<DataType> numpy_type;
      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
      if (!numpy_type->Equals(*type)) {
        return Status::NotImplemented("Expected np.datetime64 but got: ",
                                      numpy_type->ToString());
      }
      // Use the raw int64 payload of the datetime64 scalar directly.
      return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
    } else {
      // Fall back to interpreting the object as a raw epoch value.
      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
    }
    return value;
  }

  static Result<MonthDayNanoIntervalType::MonthDayNanos> Convert(
      const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) {
    MonthDayNanoIntervalType::MonthDayNanos output;
    bool found_attrs = false;
    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kMonths>::Field(
        obj, &output.months, &found_attrs));
    // on relativedelta weeks is a property calculated from days. On
    // DateOffset it is a field on its own. timedelta doesn't have a weeks
    // attribute.
    PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType();
    bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj);
    if (!is_date_offset) {
      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kDaysOnly>::Field(
          obj, &output.days, &found_attrs));
    } else {
      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kWeeksAndDays>::Field(
          obj, &output.days, &found_attrs));
    }
    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kNanoseconds>::Field(
        obj, &output.nanoseconds, &found_attrs));

    if (ARROW_PREDICT_FALSE(!found_attrs) && !is_date_offset) {
      // date_offset can have zero fields.
      return Status::TypeError("No temporal attributes found on object.");
    }
    return output;
  }

  static Result<int64_t> Convert(const DurationType* type, const O&, I obj) {
    int64_t value;
    if (PyDelta_Check(obj)) {
      auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
      switch (type->unit()) {
        case TimeUnit::SECOND:
          value = internal::PyDelta_to_s(dt);
          break;
        case TimeUnit::MILLI:
          value = internal::PyDelta_to_ms(dt);
          break;
        case TimeUnit::MICRO:
          value = internal::PyDelta_to_us(dt);
          break;
        case TimeUnit::NANO:
          if (internal::IsPandasTimedelta(obj)) {
            // pd.Timedelta value attribute is already in nanoseconds.
            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
            RETURN_IF_PYERROR();
            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
          } else {
            value = internal::PyDelta_to_ns(dt);
          }
          break;
        default:
          return Status::UnknownError("Invalid time unit");
      }
    } else if (PyArray_CheckAnyScalarExact(obj)) {
      // validate that the numpy scalar has np.timedelta64 dtype
      std::shared_ptr<DataType> numpy_type;
      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
      if (!numpy_type->Equals(*type)) {
        return Status::NotImplemented("Expected np.timedelta64 but got: ",
                                      numpy_type->ToString());
      }
      // Use the raw int64 payload of the timedelta64 scalar directly.
      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
    } else {
      // Fall back to interpreting the object as a raw duration count.
      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
    }
    return value;
  }

  // The binary-like intermediate representation is PyBytesView because it keeps
  // temporary python objects alive (non-contiguous memoryview) and stores whether
  // the original object was unicode encoded or not, which is used for
  // unicode -> bytes coercion if there is a non-unicode object observed.

  static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
    return view.ParseString(obj);
  }

  static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
                        PyBytesView& view) {
    ARROW_RETURN_NOT_OK(view.ParseString(obj));
    if (view.size != type->byte_width()) {
      std::stringstream ss;
      ss << "expected to be length " << type->byte_width() << " was " << view.size;
      return internal::InvalidValue(obj, ss.str());
    } else {
      return Status::OK();
    }
  }

  template <typename T>
  static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
                                             PyBytesView& view) {
    if (options.strict) {
      // Strict conversion, force output to be unicode / utf8 and validate that
      // any binary values are utf8
      ARROW_RETURN_NOT_OK(view.ParseString(obj, true));
      if (!view.is_utf8) {
        return internal::InvalidValue(obj, "was not a utf8 string");
      }
      return Status::OK();
    } else {
      // Non-strict conversion; keep track of whether values are unicode or bytes
      return view.ParseString(obj);
    }
  }

  // Catch-all overload for types with no specific conversion implemented.
  static Result<bool> Convert(const DataType* type, const O&, I obj) {
    return Status::NotImplemented("PyValue::Convert is not implemented for type ", type);
  }
};
510 | ||
511 | // The base Converter class is a mixin with predefined behavior and constructors. | |
512 | class PyConverter : public Converter<PyObject*, PyConversionOptions> { | |
513 | public: | |
514 | // Iterate over the input values and defer the conversion to the Append method | |
515 | Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override { | |
516 | DCHECK_GE(size, offset); | |
517 | /// Ensure we've allocated enough space | |
518 | RETURN_NOT_OK(this->Reserve(size - offset)); | |
519 | // Iterate over the items adding each one | |
520 | return internal::VisitSequence( | |
521 | values, offset, | |
522 | [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); | |
523 | } | |
524 | ||
525 | // Convert and append a sequence of values masked with a numpy array | |
526 | Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size, | |
527 | int64_t offset = 0) override { | |
528 | DCHECK_GE(size, offset); | |
529 | /// Ensure we've allocated enough space | |
530 | RETURN_NOT_OK(this->Reserve(size - offset)); | |
531 | // Iterate over the items adding each one | |
532 | return internal::VisitSequenceMasked( | |
533 | values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) { | |
534 | if (is_masked) { | |
535 | return this->AppendNull(); | |
536 | } else { | |
537 | // This will also apply the null-checking convention in the event | |
538 | // that the value is not masked | |
539 | return this->Append(item); // perhaps use AppendValue instead? | |
540 | } | |
541 | }); | |
542 | } | |
543 | }; | |
544 | ||
// Forward declarations of the concrete converter classes; PyConverterTrait
// below maps each arrow type to the converter that handles it.
template <typename T, typename Enable = void>
class PyPrimitiveConverter;

template <typename T>
class PyListConverter;

template <typename U, typename Enable = void>
class PyDictionaryConverter;

class PyStructConverter;

template <typename T, typename Enable = void>
struct PyConverterTrait;

// All non-nested, non-interval, non-extension types — plus
// MonthDayNanoIntervalType, which has a scalar C representation — are
// handled by PyPrimitiveConverter.
template <typename T>
struct PyConverterTrait<
    T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
                    !is_extension_type<T>::value) ||
                   std::is_same<T, MonthDayNanoIntervalType>::value>> {
  using type = PyPrimitiveConverter<T>;
};

// List-like types (list, large_list, fixed_size_list, map).
template <typename T>
struct PyConverterTrait<T, enable_if_list_like<T>> {
  using type = PyListConverter<T>;
};

template <>
struct PyConverterTrait<StructType> {
  using type = PyStructConverter;
};

template <>
struct PyConverterTrait<DictionaryType> {
  // Dictionary converters are additionally parameterized by the
  // dictionary value type.
  template <typename T>
  using dictionary_type = PyDictionaryConverter<T>;
};
582 | ||
583 | template <typename T> | |
584 | class PyPrimitiveConverter<T, enable_if_null<T>> | |
585 | : public PrimitiveConverter<T, PyConverter> { | |
586 | public: | |
587 | Status Append(PyObject* value) override { | |
588 | if (PyValue::IsNull(this->options_, value)) { | |
589 | return this->primitive_builder_->AppendNull(); | |
590 | } else { | |
591 | ARROW_ASSIGN_OR_RAISE( | |
592 | auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); | |
593 | return this->primitive_builder_->Append(converted); | |
594 | } | |
595 | } | |
596 | }; | |
597 | ||
598 | template <typename T> | |
599 | class PyPrimitiveConverter< | |
600 | T, enable_if_t<is_boolean_type<T>::value || is_number_type<T>::value || | |
601 | is_decimal_type<T>::value || is_date_type<T>::value || | |
602 | is_time_type<T>::value || | |
603 | std::is_same<MonthDayNanoIntervalType, T>::value>> | |
604 | : public PrimitiveConverter<T, PyConverter> { | |
605 | public: | |
606 | Status Append(PyObject* value) override { | |
607 | // Since the required space has been already allocated in the Extend functions we can | |
608 | // rely on the Unsafe builder API which improves the performance. | |
609 | if (PyValue::IsNull(this->options_, value)) { | |
610 | this->primitive_builder_->UnsafeAppendNull(); | |
611 | } else { | |
612 | ARROW_ASSIGN_OR_RAISE( | |
613 | auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); | |
614 | this->primitive_builder_->UnsafeAppend(converted); | |
615 | } | |
616 | return Status::OK(); | |
617 | } | |
618 | }; | |
619 | ||
620 | template <typename T> | |
621 | class PyPrimitiveConverter< | |
622 | T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>> | |
623 | : public PrimitiveConverter<T, PyConverter> { | |
624 | public: | |
625 | Status Append(PyObject* value) override { | |
626 | if (PyValue::IsNull(this->options_, value)) { | |
627 | this->primitive_builder_->UnsafeAppendNull(); | |
628 | } else { | |
629 | ARROW_ASSIGN_OR_RAISE( | |
630 | auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); | |
631 | // Numpy NaT sentinels can be checked after the conversion | |
632 | if (PyArray_CheckAnyScalarExact(value) && | |
633 | PyValue::IsNaT(this->primitive_type_, converted)) { | |
634 | this->primitive_builder_->UnsafeAppendNull(); | |
635 | } else { | |
636 | this->primitive_builder_->UnsafeAppend(converted); | |
637 | } | |
638 | } | |
639 | return Status::OK(); | |
640 | } | |
641 | }; | |
642 | ||
643 | template <typename T> | |
644 | class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::value>> | |
645 | : public PrimitiveConverter<T, PyConverter> { | |
646 | public: | |
647 | Status Append(PyObject* value) override { | |
648 | if (PyValue::IsNull(this->options_, value)) { | |
649 | this->primitive_builder_->UnsafeAppendNull(); | |
650 | } else { | |
651 | ARROW_RETURN_NOT_OK( | |
652 | PyValue::Convert(this->primitive_type_, this->options_, value, view_)); | |
653 | ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); | |
654 | this->primitive_builder_->UnsafeAppend(view_.bytes); | |
655 | } | |
656 | return Status::OK(); | |
657 | } | |
658 | ||
659 | protected: | |
660 | PyBytesView view_; | |
661 | }; | |
662 | ||
663 | template <typename T> | |
664 | class PyPrimitiveConverter<T, enable_if_base_binary<T>> | |
665 | : public PrimitiveConverter<T, PyConverter> { | |
666 | public: | |
667 | using OffsetType = typename T::offset_type; | |
668 | ||
669 | Status Append(PyObject* value) override { | |
670 | if (PyValue::IsNull(this->options_, value)) { | |
671 | this->primitive_builder_->UnsafeAppendNull(); | |
672 | } else { | |
673 | ARROW_RETURN_NOT_OK( | |
674 | PyValue::Convert(this->primitive_type_, this->options_, value, view_)); | |
675 | if (!view_.is_utf8) { | |
676 | // observed binary value | |
677 | observed_binary_ = true; | |
678 | } | |
679 | // Since we don't know the varying length input size in advance, we need to | |
680 | // reserve space in the value builder one by one. ReserveData raises CapacityError | |
681 | // if the value would not fit into the array. | |
682 | ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); | |
683 | this->primitive_builder_->UnsafeAppend(view_.bytes, | |
684 | static_cast<OffsetType>(view_.size)); | |
685 | } | |
686 | return Status::OK(); | |
687 | } | |
688 | ||
689 | Result<std::shared_ptr<Array>> ToArray() override { | |
690 | ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray())); | |
691 | if (observed_binary_) { | |
692 | // if we saw any non-unicode, cast results to BinaryArray | |
693 | auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton(); | |
694 | return array->View(binary_type); | |
695 | } else { | |
696 | return array; | |
697 | } | |
698 | } | |
699 | ||
700 | protected: | |
701 | PyBytesView view_; | |
702 | bool observed_binary_ = false; | |
703 | }; | |
704 | ||
705 | template <typename U> | |
706 | class PyDictionaryConverter<U, enable_if_has_c_type<U>> | |
707 | : public DictionaryConverter<U, PyConverter> { | |
708 | public: | |
709 | Status Append(PyObject* value) override { | |
710 | if (PyValue::IsNull(this->options_, value)) { | |
711 | return this->value_builder_->AppendNull(); | |
712 | } else { | |
713 | ARROW_ASSIGN_OR_RAISE(auto converted, | |
714 | PyValue::Convert(this->value_type_, this->options_, value)); | |
715 | return this->value_builder_->Append(converted); | |
716 | } | |
717 | } | |
718 | }; | |
719 | ||
720 | template <typename U> | |
721 | class PyDictionaryConverter<U, enable_if_has_string_view<U>> | |
722 | : public DictionaryConverter<U, PyConverter> { | |
723 | public: | |
724 | Status Append(PyObject* value) override { | |
725 | if (PyValue::IsNull(this->options_, value)) { | |
726 | return this->value_builder_->AppendNull(); | |
727 | } else { | |
728 | ARROW_RETURN_NOT_OK( | |
729 | PyValue::Convert(this->value_type_, this->options_, value, view_)); | |
730 | return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size)); | |
731 | } | |
732 | } | |
733 | ||
734 | protected: | |
735 | PyBytesView view_; | |
736 | }; | |
737 | ||
738 | template <typename T> | |
739 | class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> { | |
740 | public: | |
741 | Status Append(PyObject* value) override { | |
742 | if (PyValue::IsNull(this->options_, value)) { | |
743 | return this->list_builder_->AppendNull(); | |
744 | } | |
745 | ||
746 | RETURN_NOT_OK(this->list_builder_->Append()); | |
747 | if (PyArray_Check(value)) { | |
748 | RETURN_NOT_OK(AppendNdarray(value)); | |
749 | } else if (PySequence_Check(value)) { | |
750 | RETURN_NOT_OK(AppendSequence(value)); | |
751 | } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) { | |
752 | RETURN_NOT_OK(AppendIterable(value)); | |
753 | } else { | |
754 | return internal::InvalidType( | |
755 | value, "was not a sequence or recognized null for conversion to list type"); | |
756 | } | |
757 | ||
758 | return ValidateBuilder(this->list_type_); | |
759 | } | |
760 | ||
761 | protected: | |
762 | Status ValidateBuilder(const MapType*) { | |
763 | if (this->list_builder_->key_builder()->null_count() > 0) { | |
764 | return Status::Invalid("Invalid Map: key field can not contain null values"); | |
765 | } else { | |
766 | return Status::OK(); | |
767 | } | |
768 | } | |
769 | ||
770 | Status ValidateBuilder(const BaseListType*) { return Status::OK(); } | |
771 | ||
772 | Status AppendSequence(PyObject* value) { | |
773 | int64_t size = static_cast<int64_t>(PySequence_Size(value)); | |
774 | RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); | |
775 | return this->value_converter_->Extend(value, size); | |
776 | } | |
777 | ||
778 | Status AppendIterable(PyObject* value) { | |
779 | PyObject* iterator = PyObject_GetIter(value); | |
780 | OwnedRef iter_ref(iterator); | |
781 | while (PyObject* item = PyIter_Next(iterator)) { | |
782 | OwnedRef item_ref(item); | |
783 | RETURN_NOT_OK(this->value_converter_->Reserve(1)); | |
784 | RETURN_NOT_OK(this->value_converter_->Append(item)); | |
785 | } | |
786 | return Status::OK(); | |
787 | } | |
788 | ||
  // Append the contents of a 1-dimensional NumPy array as one list element.
  //
  // When the array's NumPy dtype matches the Arrow value type exactly, the
  // values are bulk-appended through the typed fast path in
  // AppendNdarrayTyped; any mismatch (or an unhandled Arrow type) falls back
  // to the slower generic PySequence-based Extend.
  Status AppendNdarray(PyObject* value) {
    PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
    if (PyArray_NDIM(ndarray) != 1) {
      return Status::Invalid("Can only convert 1-dimensional array values");
    }
    const int64_t size = PyArray_SIZE(ndarray);
    RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));

    const auto value_type = this->value_converter_->builder()->type();
    switch (value_type->id()) {
// If the value type does not match the expected NumPy dtype, then fall through
// to a slower PySequence-based path
#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE)         \
  case Type::TYPE_ID: {                                   \
    if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \
      return this->value_converter_->Extend(value, size); \
    }                                                     \
    return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
  }
      LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL)
      LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8)
      LIST_FAST_CASE(INT8, Int8Type, NPY_INT8)
      LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16)
      LIST_FAST_CASE(INT16, Int16Type, NPY_INT16)
      LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32)
      LIST_FAST_CASE(INT32, Int32Type, NPY_INT32)
      LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64)
      LIST_FAST_CASE(INT64, Int64Type, NPY_INT64)
      LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16)
      LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT)
      LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE)
      LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME)
      LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA)
#undef LIST_FAST_CASE
      default: {
        // No fast path for this Arrow type: generic element-wise conversion.
        return this->value_converter_->Extend(value, size);
      }
    }
  }
828 | ||
  // Bulk-append the contents of `ndarray`, whose NumPy dtype has already been
  // verified (by AppendNdarray) to match the Arrow value type, directly into
  // the typed value builder — bypassing per-value Python object conversion.
  template <typename ArrowType, int NUMPY_TYPE>
  Status AppendNdarrayTyped(PyArrayObject* ndarray) {
    // no need to go through the conversion
    using NumpyTrait = internal::npy_traits<NUMPY_TYPE>;
    using NumpyType = typename NumpyTrait::value_type;
    using ValueBuilderType = typename TypeTraits<ArrowType>::BuilderType;

    const bool null_sentinels_possible =
        // Always treat Numpy's NaT as null
        NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA ||
        // Observing pandas's null sentinels
        (this->options_.from_pandas && NumpyTrait::supports_nulls);

    auto value_builder =
        checked_cast<ValueBuilderType*>(this->value_converter_->builder().get());

    Ndarray1DIndexer<NumpyType> values(ndarray);
    if (null_sentinels_possible) {
      // Sentinel values (NaT, and NaN when coming from pandas) must map to
      // nulls, so scan element by element.
      for (int64_t i = 0; i < values.size(); ++i) {
        if (NumpyTrait::isnull(values[i])) {
          RETURN_NOT_OK(value_builder->AppendNull());
        } else {
          RETURN_NOT_OK(value_builder->Append(values[i]));
        }
      }
    } else if (!values.is_strided()) {
      // Contiguous data with no possible nulls: append in one bulk call.
      RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size()));
    } else {
      // Strided (non-contiguous) data: append element-wise via the indexer.
      for (int64_t i = 0; i < values.size(); ++i) {
        RETURN_NOT_OK(value_builder->Append(values[i]));
      }
    }
    return Status::OK();
  }
863 | }; | |
864 | ||
// Converter for Arrow struct types. Accepts a sequence of Python dicts,
// tuples, or key-value item lists. The accepted input shape (input_kind_)
// and, for dict/items inputs, the key type (key_kind_: bytes vs unicode)
// are inferred lazily from the first values seen and then reused.
class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait> {
 public:
  Status Append(PyObject* value) override {
    if (PyValue::IsNull(this->options_, value)) {
      return this->struct_builder_->AppendNull();
    }
    switch (input_kind_) {
      case InputKind::DICT:
        RETURN_NOT_OK(this->struct_builder_->Append());
        return AppendDict(value);
      case InputKind::TUPLE:
        RETURN_NOT_OK(this->struct_builder_->Append());
        return AppendTuple(value);
      case InputKind::ITEMS:
        RETURN_NOT_OK(this->struct_builder_->Append());
        return AppendItems(value);
      default:
        // First non-null value: infer the input shape, then retry the append.
        RETURN_NOT_OK(InferInputKind(value));
        return Append(value);
    }
  }

 protected:
  Status Init(MemoryPool* pool) override {
    RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));

    // Store the field names as a PyObjects for dict matching
    num_fields_ = this->struct_type_->num_fields();
    bytes_field_names_.reset(PyList_New(num_fields_));
    unicode_field_names_.reset(PyList_New(num_fields_));
    RETURN_IF_PYERROR();

    for (int i = 0; i < num_fields_; i++) {
      const auto& field_name = this->struct_type_->field(i)->name();
      PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
      PyObject* unicode =
          PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
      RETURN_IF_PYERROR();
      // PyList_SET_ITEM steals the new references created just above.
      PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes);
      PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode);
    }
    return Status::OK();
  }

  Status InferInputKind(PyObject* value) {
    // Infer input object's type, note that heterogeneous sequences are not allowed
    if (PyDict_Check(value)) {
      input_kind_ = InputKind::DICT;
    } else if (PyTuple_Check(value)) {
      input_kind_ = InputKind::TUPLE;
    } else if (PySequence_Check(value)) {
      input_kind_ = InputKind::ITEMS;
    } else {
      return internal::InvalidType(value,
                                   "was not a dict, tuple, or recognized null value "
                                   "for conversion to struct type");
    }
    return Status::OK();
  }

  // Scan the key-value pairs in `items` until a key matches one of the
  // stored field names, fixing key_kind_ to UNICODE or BYTES accordingly.
  // Leaves key_kind_ as UNKNOWN if no key matches any field name.
  Status InferKeyKind(PyObject* items) {
    for (int i = 0; i < PySequence_Length(items); i++) {
      // retrieve the key from the passed key-value pairs
      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));

      // check key exists between the unicode field names
      bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first);
      RETURN_IF_PYERROR();
      if (do_contain) {
        key_kind_ = KeyKind::UNICODE;
        return Status::OK();
      }

      // check key exists between the bytes field names
      do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first);
      RETURN_IF_PYERROR();
      if (do_contain) {
        key_kind_ = KeyKind::BYTES;
        return Status::OK();
      }
    }
    return Status::OK();
  }

  // Append a null to every child field (used when no key could be matched,
  // i.e. all field values are absent from the input).
  Status AppendEmpty() {
    for (int i = 0; i < num_fields_; i++) {
      RETURN_NOT_OK(this->children_[i]->Append(Py_None));
    }
    return Status::OK();
  }

  // Append a tuple positionally: element i goes to child field i.
  Status AppendTuple(PyObject* tuple) {
    if (!PyTuple_Check(tuple)) {
      return internal::InvalidType(tuple, "was expecting a tuple");
    }
    if (PyTuple_GET_SIZE(tuple) != num_fields_) {
      return Status::Invalid("Tuple size must be equal to number of struct fields");
    }
    for (int i = 0; i < num_fields_; i++) {
      PyObject* value = PyTuple_GET_ITEM(tuple, i);
      RETURN_NOT_OK(this->children_[i]->Append(value));
    }
    return Status::OK();
  }

  Status AppendDict(PyObject* dict) {
    if (!PyDict_Check(dict)) {
      return internal::InvalidType(dict, "was expecting a dict");
    }
    switch (key_kind_) {
      case KeyKind::UNICODE:
        return AppendDict(dict, unicode_field_names_.obj());
      case KeyKind::BYTES:
        return AppendDict(dict, bytes_field_names_.obj());
      default:
        // NOTE(review): PyDict_Items returns a new list reference which is
        // never decref'd here — looks like a small per-inference leak;
        // confirm and consider holding it in an OwnedRef.
        RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict)));
        if (key_kind_ == KeyKind::UNKNOWN) {
          // was unable to infer the type which means that all keys are absent
          return AppendEmpty();
        } else {
          return AppendDict(dict);
        }
    }
  }

  Status AppendItems(PyObject* items) {
    if (!PySequence_Check(items)) {
      return internal::InvalidType(items, "was expecting a sequence of key-value items");
    }
    switch (key_kind_) {
      case KeyKind::UNICODE:
        return AppendItems(items, unicode_field_names_.obj());
      case KeyKind::BYTES:
        return AppendItems(items, bytes_field_names_.obj());
      default:
        RETURN_NOT_OK(InferKeyKind(items));
        if (key_kind_ == KeyKind::UNKNOWN) {
          // was unable to infer the type which means that all keys are absent
          return AppendEmpty();
        } else {
          return AppendItems(items);
        }
    }
  }

  // Append one value per struct field, looked up in `dict` by name; missing
  // keys are appended as None (null).
  Status AppendDict(PyObject* dict, PyObject* field_names) {
    // NOTE we're ignoring any extraneous dict items
    for (int i = 0; i < num_fields_; i++) {
      PyObject* name = PyList_GET_ITEM(field_names, i);  // borrowed
      PyObject* value = PyDict_GetItem(dict, name);      // borrowed
      if (value == NULL) {
        // NULL can mean "key absent" or a raised exception; only the latter
        // sets a Python error, which RETURN_IF_PYERROR surfaces.
        RETURN_IF_PYERROR();
      }
      RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None));
    }
    return Status::OK();
  }

  // Fetch element `index` of `seq` and split it into a (key, value) pair,
  // validating that it is a 2-tuple.
  //
  // NOTE(review): PySequence_GetItem returns a *new* reference that is never
  // released, while the returned key/value pointers are borrowed from it —
  // so the pair object appears to be intentionally kept alive (leaked) to
  // keep key/value valid for the caller. Confirm whether an owned wrapper
  // should be threaded through instead.
  Result<std::pair<PyObject*, PyObject*>> GetKeyValuePair(PyObject* seq, int index) {
    PyObject* pair = PySequence_GetItem(seq, index);
    RETURN_IF_PYERROR();
    if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) {
      return internal::InvalidType(pair, "was expecting tuple of (key, value) pair");
    }
    PyObject* key = PyTuple_GetItem(pair, 0);
    RETURN_IF_PYERROR();
    PyObject* value = PyTuple_GetItem(pair, 1);
    RETURN_IF_PYERROR();
    return std::make_pair(key, value);
  }

  // Append from a list of (key, value) pairs; pair i must name field i.
  // Fields beyond the end of `items` are filled with nulls.
  Status AppendItems(PyObject* items, PyObject* field_names) {
    auto length = static_cast<int>(PySequence_Size(items));
    RETURN_IF_PYERROR();

    // append the values for the defined fields
    for (int i = 0; i < std::min(num_fields_, length); i++) {
      // retrieve the key-value pair
      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));

      // validate that the key and the field name are equal
      PyObject* name = PyList_GET_ITEM(field_names, i);
      bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ);
      RETURN_IF_PYERROR();

      // finally append to the respective child builder
      if (are_equal) {
        RETURN_NOT_OK(this->children_[i]->Append(pair.second));
      } else {
        ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first));
        ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name));
        return Status::Invalid("The expected field name is `", name_view.bytes, "` but `",
                               key_view.bytes, "` was given");
      }
    }
    // insert null values for missing fields
    for (int i = length; i < num_fields_; i++) {
      RETURN_NOT_OK(this->children_[i]->AppendNull());
    }
    return Status::OK();
  }

  // Whether we're converting from a sequence of dicts or tuples or list of pairs
  enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN;
  // Whether the input dictionary keys' type is python bytes or unicode
  enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN;
  // Store the field names as a PyObjects for dict matching
  OwnedRef bytes_field_names_;
  OwnedRef unicode_field_names_;
  // Store the number of fields for later reuse
  int num_fields_;
};
1077 | ||
// Convert *obj* to a sequence if necessary
// Fill *size* to its length.  If >= 0 on entry, *size* is an upper size
// bound that may lead to truncation.
// On success, *seq holds a new reference owned by the caller.
Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) {
  if (PySequence_Check(obj)) {
    // obj is already a sequence
    int64_t real_size = static_cast<int64_t>(PySequence_Size(obj));
    if (*size < 0) {
      *size = real_size;
    } else {
      // Honor the caller-supplied upper bound.
      *size = std::min(real_size, *size);
    }
    Py_INCREF(obj);
    *seq = obj;
  } else if (*size < 0) {
    // unknown size, exhaust iterator
    *seq = PySequence_List(obj);
    RETURN_IF_PYERROR();
    *size = static_cast<int64_t>(PyList_GET_SIZE(*seq));
  } else {
    // size is known but iterator could be infinite
    // Pull at most n items so an unbounded generator cannot hang us.
    Py_ssize_t i, n = *size;
    PyObject* iter = PyObject_GetIter(obj);
    RETURN_IF_PYERROR();
    OwnedRef iter_ref(iter);
    PyObject* lst = PyList_New(n);
    RETURN_IF_PYERROR();
    for (i = 0; i < n; i++) {
      PyObject* item = PyIter_Next(iter);
      if (!item) break;
      // PyList_SET_ITEM steals the new reference returned by PyIter_Next.
      PyList_SET_ITEM(lst, i, item);
    }
    // Shrink list if len(iterator) < size
    // (the remaining slots still hold NULL and must be trimmed off).
    if (i < n && PyList_SetSlice(lst, i, n, NULL)) {
      Py_DECREF(lst);
      return Status::UnknownError("failed to resize list");
    }
    *seq = lst;
    *size = std::min<int64_t>(i, *size);
  }
  return Status::OK();
}
1120 | ||
1121 | } // namespace | |
1122 | ||
1123 | Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* mask, | |
1124 | PyConversionOptions options, | |
1125 | MemoryPool* pool) { | |
1126 | PyAcquireGIL lock; | |
1127 | ||
1128 | PyObject* seq; | |
1129 | OwnedRef tmp_seq_nanny; | |
1130 | ||
1131 | ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas")); | |
1132 | if (is_pandas_imported) { | |
1133 | // If pandas has been already imported initialize the static pandas objects to | |
1134 | // support converting from pd.Timedelta and pd.Timestamp objects | |
1135 | internal::InitPandasStaticData(); | |
1136 | } | |
1137 | ||
1138 | int64_t size = options.size; | |
1139 | RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); | |
1140 | tmp_seq_nanny.reset(seq); | |
1141 | ||
1142 | // In some cases, type inference may be "loose", like strings. If the user | |
1143 | // passed pa.string(), then we will error if we encounter any non-UTF8 | |
1144 | // value. If not, then we will allow the result to be a BinaryArray | |
1145 | if (options.type == nullptr) { | |
1146 | ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); | |
1147 | options.strict = false; | |
1148 | } else { | |
1149 | options.strict = true; | |
1150 | } | |
1151 | DCHECK_GE(size, 0); | |
1152 | ||
1153 | ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>( | |
1154 | options.type, options, pool))); | |
1155 | if (converter->may_overflow()) { | |
1156 | // The converter hierarchy contains binary- or list-like builders which can overflow | |
1157 | // depending on the input values. Wrap the converter with a chunker which detects | |
1158 | // the overflow and automatically creates new chunks. | |
1159 | ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); | |
1160 | if (mask != nullptr && mask != Py_None) { | |
1161 | RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); | |
1162 | } else { | |
1163 | RETURN_NOT_OK(chunked_converter->Extend(seq, size)); | |
1164 | } | |
1165 | return chunked_converter->ToChunkedArray(); | |
1166 | } else { | |
1167 | // If the converter can't overflow spare the capacity error checking on the hot-path, | |
1168 | // this improves the performance roughly by ~10% for primitive types. | |
1169 | if (mask != nullptr && mask != Py_None) { | |
1170 | RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); | |
1171 | } else { | |
1172 | RETURN_NOT_OK(converter->Extend(seq, size)); | |
1173 | } | |
1174 | return converter->ToChunkedArray(); | |
1175 | } | |
1176 | } | |
1177 | ||
1178 | } // namespace py | |
1179 | } // namespace arrow |