1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
25 import hypothesis
as h
30 from pyarrow
.pandas_compat
import _pandas_api
# noqa
32 import pyarrow
.tests
.strategies
as past
37 (np
.int16
, pa
.int16()),
38 (np
.int32
, pa
.int32()),
39 (np
.int64
, pa
.int64()),
40 (np
.uint8
, pa
.uint8()),
41 (np
.uint16
, pa
.uint16()),
42 (np
.uint32
, pa
.uint32()),
43 (np
.uint64
, pa
.uint64())]
# Split the (numpy dtype, arrow type) pairs into two parallel tuples so
# tests can parametrize over either side independently.
np_int_types, pa_int_types = zip(*int_type_pairs)
class StrangeIterable:
    """A minimal iterable wrapper (not a sequence) used to exercise
    conversion of plain iterables to Arrow arrays."""

    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()
58 def __init__(self
, value
):
def check_struct_type(ty, expected):
    """
    Check a struct type is as expected, but not taking order into account.
    """
    assert pa.types.is_struct(ty)
    assert set(ty) == set(expected)
def test_iterable_types():
    # A custom iterable converts the same as an equivalent tuple
    arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
    arr2 = pa.array((0, 1, 2, 3))

    assert arr1.equals(arr2)
def test_empty_iterable():
    # An empty iterable infers the null type
    arr = pa.array(StrangeIterable([]))

    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
def test_limited_iterator_types():
    # An iterator with an exact size hint converts fully
    arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
    arr2 = pa.array((0, 1, 2))
    assert arr1.equals(arr2)
def test_limited_iterator_size_overflow():
    # A size hint smaller than the iterator truncates the result
    arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
    arr2 = pa.array((0, 1))
    assert arr1.equals(arr2)
def test_limited_iterator_size_underflow():
    # A size hint larger than the iterator takes only what is available
    arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
    arr2 = pa.array((0, 1, 2))
    assert arr1.equals(arr2)
def test_iterator_without_size():
    # An iterator with no size hint is consumed entirely
    expected = pa.array((0, 1, 2))
    arr1 = pa.array(iter(range(3)))
    assert arr1.equals(expected)
    # Same with explicit type
    arr1 = pa.array(iter(range(3)), type=pa.int64())
    assert arr1.equals(expected)
def test_infinite_iterator():
    # An unbounded iterator requires a size hint to terminate
    expected = pa.array((0, 1, 2))
    arr1 = pa.array(itertools.count(0), size=3)
    assert arr1.equals(expected)
    # Same with explicit type
    arr1 = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert arr1.equals(expected)
138 # deque is a sequence while neither tuple nor list
139 return collections
.deque(xs
)
142 def _as_dict_values(xs
):
143 # a dict values object is not a sequence, just a regular iterable
144 dct
= {k
: v
for k
, v
in enumerate(xs
)}
148 def _as_numpy_array(xs
):
149 arr
= np
.empty(len(xs
), dtype
=object)
# Families of input containers, from most to least sequence-like; each entry
# is a callable that wraps a list into that container kind.
SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES

parametrize_with_iterable_types = pytest.mark.parametrize(
    "seq", ITERABLE_TYPES
)

parametrize_with_sequence_types = pytest.mark.parametrize(
    "seq", SEQUENCE_TYPES
)

parametrize_with_collections_types = pytest.mark.parametrize(
    "seq", COLLECTIONS_TYPES
)
@parametrize_with_collections_types
def test_sequence_types(seq):
    # every supported container kind converts like a plain list
    arr1 = pa.array(seq([1, 2, 3]))
    arr2 = pa.array([1, 2, 3])

    assert arr1.equals(arr2)
@parametrize_with_iterable_types
def test_nested_sequence_types(seq):
    # containers nested inside a list convert like nested lists
    arr1 = pa.array([seq([1, 2, 3])])
    arr2 = pa.array([[1, 2, 3]])

    assert arr1.equals(arr2)
@parametrize_with_sequence_types
def test_sequence_boolean(seq):
    expected = [True, None, False, None]
    arr = pa.array(seq(expected))

    assert arr.null_count == 2
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
def test_sequence_numpy_boolean(seq):
    # numpy bool scalars convert to Arrow booleans / Python bools
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, None]
@parametrize_with_sequence_types
def test_sequence_mixed_numpy_python_bools(seq):
    # numpy and Python bools may be mixed in one input
    values = np.array([True, False])
    arr = pa.array(seq([values[0], None, values[1], True, False]))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, True, False]
@parametrize_with_collections_types
def test_empty_list(seq):
    # empty input infers the null type
    arr = pa.array(seq([]))

    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
@parametrize_with_sequence_types
def test_nested_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    # With an explicit (narrower) value type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
@parametrize_with_sequence_types
def test_nested_large_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data), type=pa.large_list(pa.int16()))

    assert arr.null_count == 1
    assert arr.type == pa.large_list(pa.int16())
    assert arr.to_pylist() == data
@parametrize_with_collections_types
def test_list_with_non_list(seq):
    # List types don't accept non-sequences
    with pytest.raises(TypeError):
        pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
    with pytest.raises(TypeError):
        pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
@parametrize_with_sequence_types
def test_nested_arrays(seq):
    # numpy sub-arrays convert like nested lists
    arr = pa.array(seq([np.array([], dtype=np.int64),
                        np.array([1, 2], dtype=np.int64), None]))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
@parametrize_with_sequence_types
def test_nested_fixed_size_list(seq):
    # sequence of lists
    data = [[1, 2], [3, None], None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 2)
    assert arr.to_pylist() == data

    # sequence of numpy arrays
    data = [np.array([1, 2], dtype='int64'), np.array([3, 4], dtype='int64'),
            None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 2)
    assert arr.to_pylist() == [[1, 2], [3, 4], None]

    # incorrect length of the lists or arrays
    data = [[1, 2, 4], [3, None], None]
    for data in [[[1, 2, 3]], [np.array([1, 2, 4], dtype='int64')]]:
        with pytest.raises(
                ValueError, match="Length of item not correct: expected 2"):
            pa.array(seq(data), type=pa.list_(pa.int64(), 2))

    # with list size of 0
    data = [[], [], None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 0))

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 0)
    assert arr.to_pylist() == [[], [], None]
@parametrize_with_sequence_types
def test_sequence_all_none(seq):
    arr = pa.array(seq([None, None]))

    assert arr.null_count == 2
    assert arr.type == pa.null()
    assert arr.to_pylist() == [None, None]
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer(seq, np_scalar_pa_type):
    np_scalar, pa_type = np_scalar_pa_type
    # include both extreme representable values of the target type
    expected = [1, None, 3, None,
                np.iinfo(np_scalar).min, np.iinfo(np_scalar).max]
    arr = pa.array(seq(expected), type=pa_type)

    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
@parametrize_with_collections_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # an error when converted to an integer type
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([np.nan]), type=pa_type, from_pandas=False)

    # with from_pandas=True the NaN is treated as a null instead
    arr = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
    expected = [None]
    assert arr.null_count == 1
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # an error when converted to a nested integer type
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

    # with from_pandas=True the nested NaN becomes a null list element
    arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
    expected = [[None]]
    assert arr.null_count == 0
    assert arr.type == pa.list_(pa_type)
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
def test_sequence_integer_inferred(seq):
    # plain Python ints infer int64
    expected = [1, None, 3, None]
    arr = pa.array(seq(expected))

    assert arr.null_count == 2
    assert arr.type == pa.int64()
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer(seq, np_scalar_pa_type):
    np_scalar, pa_type = np_scalar_pa_type
    expected = [np_scalar(1), None, np_scalar(3), None,
                np_scalar(np.iinfo(np_scalar).min),
                np_scalar(np.iinfo(np_scalar).max)]
    arr = pa.array(seq(expected), type=pa_type)

    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
    # without an explicit type, the numpy scalar dtype drives inference
    np_scalar, pa_type = np_scalar_pa_type
    expected = [np_scalar(1), None, np_scalar(3), None]
    expected += [np_scalar(np.iinfo(np_scalar).min),
                 np_scalar(np.iinfo(np_scalar).max)]
    arr = pa.array(seq(expected))

    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
@parametrize_with_sequence_types
def test_sequence_custom_integers(seq):
    # MyInt-wrapped values convert like the underlying ints
    expected = [0, 42, 2**33 + 1, -2**63]
    data = list(map(MyInt, expected))
    arr = pa.array(seq(data), type=pa.int64())
    assert arr.to_pylist() == expected
@parametrize_with_collections_types
def test_broken_integers(seq):
    # an object whose int conversion fails raises ArrowInvalid
    data = [MyBrokenInt()]
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        pa.array(seq(data), type=pa.int64())
def test_numpy_scalars_mixed_type():
    # int32 + float32 promotes to float64
    data = [np.int32(10), np.float32(0.5)]
    arr = pa.array(data)
    expected = pa.array([10, 0.5], type="float64")
    assert arr.equals(expected)

    # int8 + float32 stays within float32
    data = [np.int8(10), np.float32(0.5)]
    arr = pa.array(data)
    expected = pa.array([10, 0.5], type="float32")
    assert arr.equals(expected)
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
                   raises=OverflowError)
def test_uint64_max_convert():
    data = [0, np.iinfo(np.uint64).max]

    arr = pa.array(data, type=pa.uint64())
    expected = pa.array(np.array(data, dtype='uint64'))
    assert arr.equals(expected)

    arr_inferred = pa.array(data)
    assert arr_inferred.equals(expected)
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_signed_integer_overflow(bits):
    ty = getattr(pa, "int%d" % bits)()
    # XXX ideally would always raise OverflowError
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([2 ** (bits - 1)], ty)
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([-2 ** (bits - 1) - 1], ty)
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_unsigned_integer_overflow(bits):
    ty = getattr(pa, "uint%d" % bits)()
    # XXX ideally would always raise OverflowError
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([2 ** bits], ty)
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([-1], ty)
@parametrize_with_collections_types
@pytest.mark.parametrize("typ", pa_int_types)
def test_integer_from_string_error(seq, typ):
    # ARROW-9451: pa.array(['1'], type=pa.uint32()) should not succeed
    with pytest.raises(pa.ArrowInvalid):
        pa.array(seq(['1']), type=typ)
def test_convert_with_mask():
    data = [1, 2, 3, 4, 5]
    mask = np.array([False, True, False, False, True])

    result = pa.array(data, mask=mask)
    expected = pa.array([1, None, 3, 4, None])

    assert result.equals(expected)

    # a mask of the wrong length is rejected
    with pytest.raises(ValueError):
        pa.array(data, mask=mask[1:])
def test_garbage_collection():
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()

    bytes_before = pa.total_allocated_bytes()
    pa.array([1, None, 3, None])
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before
def test_sequence_double():
    data = [1.5, 1., None, 2.5, None, None]
    arr = pa.array(data)

    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == data
def test_double_auto_coerce_from_integer():
    # Done as part of ARROW-2814
    data = [1.5, 1., None, 2.5, None, None]
    arr = pa.array(data)

    data2 = [1.5, 1, None, 2.5, None, None]
    arr2 = pa.array(data2)

    assert arr.equals(arr2)

    data3 = [1, 1.5, None, 2.5, None, None]
    arr3 = pa.array(data3)

    data4 = [1., 1.5, None, 2.5, None, None]
    arr4 = pa.array(data4)

    assert arr3.equals(arr4)
def test_double_integer_coerce_representable_range():
    valid_values = [1.5, 1, 2, None, 1 << 53, -(1 << 53)]
    invalid_values = [1.5, 1, 2, None, (1 << 53) + 1]
    invalid_values2 = [1.5, 1, 2, None, -((1 << 53) + 1)]

    # ints within the exactly-representable double range convert fine
    pa.array(valid_values)

    # ints outside it must raise rather than silently lose precision
    with pytest.raises(ValueError):
        pa.array(invalid_values)

    with pytest.raises(ValueError):
        pa.array(invalid_values2)
def test_float32_integer_coerce_representable_range():
    f32 = np.float32
    valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
    invalid_values = [f32(1.5), (1 << 24) + 1]
    invalid_values2 = [f32(1.5), -((1 << 24) + 1)]

    # ints within the exactly-representable float32 range convert fine
    pa.array(valid_values, type=pa.float32())

    # ints outside it must raise rather than silently lose precision
    with pytest.raises(ValueError):
        pa.array(invalid_values, type=pa.float32())

    with pytest.raises(ValueError):
        pa.array(invalid_values2, type=pa.float32())
def test_mixed_sequence_errors():
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar,pa_type", [
    (np.float16, pa.float16()),
    (np.float32, pa.float32()),
    (np.float64, pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    if from_pandas:
        # NaN counts as a null when coming from pandas
        assert arr.null_count == 3
    else:
        assert arr.null_count == 2
    if from_pandas:
        # The NaN is skipped in type inference, otherwise it forces a
        # float64 promotion
        assert arr.type == pa_type
    else:
        assert arr.type == pa.float64()

    assert arr.to_pylist()[:4] == data[:4]
    if from_pandas:
        assert arr.to_pylist()[5] is None
    else:
        assert np.isnan(arr.to_pylist()[5])
@pytest.mark.parametrize("from_pandas", [True, False])
@pytest.mark.parametrize("inner_seq", [np.array, list])
def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
    # nested float data with a NaN and a null sub-list
    data = np.array([
        inner_seq([1., 2.]),
        inner_seq([1., 2., 3.]),
        inner_seq([np.nan]),
        None
    ], dtype=object)
    arr = pa.array(data, from_pandas=from_pandas)

    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.float64())
    if from_pandas:
        assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
    else:
        np.testing.assert_equal(arr.to_pylist(),
                                [[1., 2.], [1., 2., 3.], [np.nan], None])
def test_nested_ndarray_in_object_array():
    # object-dtype ndarrays holding ndarrays convert to list-of-list
    arr = np.empty(2, dtype=object)
    arr[:] = [np.array([1, 2], dtype=np.int64),
              np.array([2, 3], dtype=np.int64)]

    arr2 = np.empty(2, dtype=object)
    arr2[0] = np.array([3, 4], dtype=np.int64)
    arr2[1] = np.array([5, 6], dtype=np.int64)

    expected_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([arr]) == expected_type

    result = pa.array([arr, arr2])
    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                        type=expected_type)

    assert result.equals(expected)

    # test case for len-1 arrays to ensure they are interpreted as
    # sublists and not scalars
    arr = np.empty(2, dtype=object)
    arr[:] = [np.array([1]), np.array([2])]
    result = pa.array([arr, arr])
    assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
                           "not yet implemented"),
                   raises=AssertionError)
def test_multidimensional_ndarray_as_nested_list():
    # TODO(wesm): see ARROW-5645
    arr = np.array([[1, 2], [2, 3]], dtype=np.int64)
    arr2 = np.array([[3, 4], [5, 6]], dtype=np.int64)

    expected_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([arr]) == expected_type

    result = pa.array([arr, arr2])
    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                        type=expected_type)

    assert result.equals(expected)
@pytest.mark.parametrize(('data', 'value_type'), [
    ([True, False], pa.bool_()),
    ([None, None], pa.null()),
    ([1, 2, None], pa.int8()),
    ([1, 2., 3., None], pa.float32()),
    ([datetime.date.today(), None], pa.date32()),
    ([None, datetime.date.today()], pa.date64()),
    ([datetime.time(1, 1, 1), None], pa.time32('s')),
    ([None, datetime.time(2, 2, 2)], pa.time64('us')),
    ([datetime.datetime.now(), None], pa.timestamp('us')),
    ([datetime.timedelta(seconds=10)], pa.duration('s')),
    ([b"a", b"b"], pa.binary()),
    ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
    ([b"a", b"b", b"c"], pa.large_binary()),
    (["a", "b", "c"], pa.string()),
    (["a", "b", "c"], pa.large_string()),
    (
        [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
        pa.struct([('a', pa.int8()), ('b', pa.int16())])
    )
])
def test_list_array_from_object_ndarray(data, value_type):
    # an object ndarray of values maps to a single list<value_type> element
    ty = pa.list_(value_type)
    ndarray = np.array(data, dtype=object)
    arr = pa.array([ndarray], type=ty)
    assert arr.type.equals(ty)
    assert arr.to_pylist() == [data]
@pytest.mark.parametrize(('data', 'value_type'), [
    ([[1, 2], [3]], pa.list_(pa.int64())),
    ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
    ([[1], [2, 3]], pa.large_list(pa.int64()))
])
def test_nested_list_array_from_object_ndarray(data, value_type):
    # build an object ndarray whose items are themselves object ndarrays
    ndarray = np.empty(len(data), dtype=object)
    ndarray[:] = [np.array(item, dtype=object) for item in data]

    ty = pa.list_(value_type)
    arr = pa.array([ndarray], type=ty)
    assert arr.type.equals(ty)
    assert arr.to_pylist() == [data]
def test_array_ignore_nan_from_pandas():
    # See ARROW-4324, this reverts logic that was introduced in
    # ARROW-2240
    with pytest.raises(ValueError):
        pa.array([np.nan, 'str'])

    arr = pa.array([np.nan, 'str'], from_pandas=True)
    expected = pa.array([None, 'str'])
    assert arr.equals(expected)
def test_nested_ndarray_different_dtypes():
    # int64 and uint32 sub-arrays unify to list<int64> on inference
    data = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    arr = pa.array(data)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert arr.equals(expected)

    t2 = pa.list_(pa.uint32())
    arr2 = pa.array(data, type=t2)
    expected2 = expected.cast(t2)
    assert arr2.equals(expected2)
def test_sequence_unicode():
    data = ['foo', 'bar', None, 'mañana']
    arr = pa.array(data)

    assert arr.null_count == 1
    assert arr.type == pa.string()
    assert arr.to_pylist() == data
def check_array_mixed_unicode_bytes(binary_type, string_type):
    """Check conversion of mixed str/bytes/bytearray input: inference picks
    binary, and explicit binary/string types coerce all values."""
    values = ['qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = ['qux', 'foo', 'barz']

    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    arr = pa.array(values, type=binary_type)
    expected = pa.array(b_values, type=binary_type)
    assert arr.type == binary_type
    assert arr.equals(expected)

    arr = pa.array(values, type=string_type)
    expected = pa.array(u_values, type=string_type)
    assert arr.type == string_type
    assert arr.equals(expected)
def test_array_mixed_unicode_bytes():
    check_array_mixed_unicode_bytes(pa.binary(), pa.string())
    check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
def test_large_binary_array(ty):
    # Construct a large binary array with more than 4GB of data
    s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
    nrepeats = math.ceil((2**32 + 5) / len(s))
    data = [s] * nrepeats
    arr = pa.array(data, type=ty)
    assert isinstance(arr, pa.Array)
    assert arr.type == ty
    assert len(arr) == nrepeats
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
def test_large_binary_value(ty):
    # Construct a large binary array with a single value larger than 4GB
    s = b"0123456789abcdefghijklmnopqrstuvwxyz"
    nrepeats = math.ceil((2**32 + 5) / len(s))
    arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
    assert isinstance(arr, pa.Array)
    assert arr.type == ty

    buf = arr[1].as_buffer()
    assert len(buf) == len(s) * nrepeats
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
def test_string_too_large(ty):
    # Construct a binary array with a single value larger than 4GB
    s = b"0123456789abcdefghijklmnopqrstuvwxyz"
    nrepeats = math.ceil((2**32 + 5) / len(s))
    with pytest.raises(pa.ArrowCapacityError):
        pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
def test_sequence_bytes():
    u1 = b'ma\xc3\xb1ana'

    # mix of bytes-like inputs; all should land as binary values
    data = [b'foo',
            memoryview(b'dada'),
            memoryview(b'd-a-t-a')[::2],  # non-contiguous is made contiguous
            u1.decode('utf-8'),  # unicode gets encoded,
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary(), pa.large_binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 6
        assert arr.null_count == 1
        assert arr.type == ty or pa.binary()
        assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]
@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
def test_sequence_utf8_to_unicode(ty):
    # utf-8 encoded bytes decode to str under an explicit string type
    data = [b'foo', None, b'bar']
    arr = pa.array(data, type=ty)
    assert arr.type == ty
    assert arr[0].as_py() == 'foo'

    # test a non-utf8 unicode string
    val = ('mañana').encode('utf-16-le')
    with pytest.raises(pa.ArrowInvalid):
        pa.array([val], type=ty)
def test_sequence_fixed_size_bytes():
    data = [b'foof', None, bytearray(b'barb'), b'2346']
    arr = pa.array(data, type=pa.binary(4))

    assert arr.null_count == 1
    assert arr.type == pa.binary(4)
    assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
def test_fixed_size_bytes_does_not_accept_varying_lengths():
    # b'foo' is 3 bytes, shorter than the declared width of 4
    data = [b'foo', None, b'barb', b'2346']
    with pytest.raises(pa.ArrowInvalid):
        pa.array(data, type=pa.binary(4))
def test_fixed_size_binary_length_check():
    # a 12-byte value must round-trip through fixed-size binary(12)
    data = [b'\x19h\r\x9e\x00\x00\x00\x00\x01\x9b\x9fA']
    assert len(data[0]) == 12
    ty = pa.binary(12)
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == data
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)

    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    # 10957 days (or its millisecond equivalent) is 2000-01-01 from epoch
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1].as_py() is None
def test_date32_overflow():
    # a value beyond the int32 day range must raise, not wrap
    data3 = [2**32, None]
    with pytest.raises((OverflowError, pa.ArrowException)):
        pa.array(data3, type=pa.date32())
@pytest.mark.parametrize(('time_type', 'unit', 'int_type'), [
    (pa.time32, 's', 'int32'),
    (pa.time32, 'ms', 'int32'),
    (pa.time64, 'us', 'int64'),
    (pa.time64, 'ns', 'int64')
])
def test_sequence_time_with_timezone(time_type, unit, int_type):
    def expected_integer_value(t):
        # only use with utc time object because it doesn't adjust with the
        # timezone
        units = ['s', 'ms', 'us', 'ns']
        multiplier = 10**(units.index(unit) * 3)
        if t is None:
            return None
        seconds = (
            t.hour * 3600 +
            t.minute * 60 +
            t.second +
            t.microsecond * 10**-6
        )
        return int(seconds * multiplier)

    def expected_time_value(t):
        # only use with utc time object because it doesn't adjust with the
        # time objects tzdata
        if unit == 's':
            return t.replace(microsecond=0)
        elif unit == 'ms':
            return t.replace(microsecond=(t.microsecond // 1000) * 1000)
        else:
            return t

    # only timezone naive times are supported in arrow
    data = [
        datetime.time(8, 23, 34, 123456),
        datetime.time(5, 0, 0, 1000),
        None,
        datetime.time(1, 11, 56, 432539),
        datetime.time(23, 10, 0, 437699)
    ]

    ty = time_type(unit)
    arr = pa.array(data, type=ty)
    assert len(arr) == 5
    assert arr.type == ty
    assert arr.null_count == 1

    # test that the underlying integers are UTC values
    values = arr.cast(int_type)
    expected = list(map(expected_integer_value, data))
    assert values.to_pylist() == expected

    # test that the scalars are datetime.time objects with UTC timezone
    assert arr[0].as_py() == expected_time_value(data[0])
    assert arr[1].as_py() == expected_time_value(data[1])
    assert arr[2].as_py() is None
    assert arr[3].as_py() == expected_time_value(data[3])
    assert arr[4].as_py() == expected_time_value(data[4])
def tz(hours, minutes=0):
    """Build a fixed-offset ``datetime.timezone`` at the given offset."""
    offset = datetime.timedelta(hours=hours, minutes=minutes)
    return datetime.timezone(offset)
def test_sequence_timestamp():
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
    ]
    arr = pa.array(data)

    assert arr.type == pa.timestamp('us')
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                               23, 34, 123456)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
                                               34, 56, 432539)
    assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
                                               46, 57, 437699)
985 @pytest.mark
.parametrize('timezone', [
991 @pytest.mark
.parametrize('unit', [
997 def test_sequence_timestamp_with_timezone(timezone
, unit
):
998 def expected_integer_value(dt
):
999 units
= ['s', 'ms', 'us', 'ns']
1000 multiplier
= 10**(units
.index(unit
) * 3)
1004 # avoid float precision issues
1005 ts
= decimal
.Decimal(str(dt
.timestamp()))
1006 return int(ts
* multiplier
)
1008 def expected_datetime_value(dt
):
1013 dt
= dt
.replace(microsecond
=0)
1015 dt
= dt
.replace(microsecond
=(dt
.microsecond
// 1000) * 1000)
1017 # adjust the timezone
1018 if timezone
is None:
1019 # make datetime timezone unaware
1020 return dt
.replace(tzinfo
=None)
1022 # convert to the expected timezone
1023 return dt
.astimezone(pytz
.timezone(timezone
))
1026 datetime
.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1028 datetime
.datetime(2008, 1, 5, 5, 0, 0, 1000)
1031 pytz
.timezone('US/Eastern').localize(
1032 datetime
.datetime(2006, 1, 13, 12, 34, 56, 432539)
1034 pytz
.timezone('Europe/Moscow').localize(
1035 datetime
.datetime(2010, 8, 13, 5, 0, 0, 437699)
1039 pytz
.utc
.localize(data
[0]),
1042 data
[3].astimezone(pytz
.utc
),
1043 data
[4].astimezone(pytz
.utc
),
1046 ty
= pa
.timestamp(unit
, tz
=timezone
)
1047 arr
= pa
.array(data
, type=ty
)
1048 assert len(arr
) == 5
1049 assert arr
.type == ty
1050 assert arr
.null_count
== 1
1052 # test that the underlying integers are UTC values
1053 values
= arr
.cast('int64')
1054 expected
= list(map(expected_integer_value
, utcdata
))
1055 assert values
.to_pylist() == expected
1057 # test that the scalars are datetimes with the correct timezone
1058 for i
in range(len(arr
)):
1059 assert arr
[i
].as_py() == expected_datetime_value(utcdata
[i
])
1062 @pytest.mark
.parametrize('timezone', [
1068 def test_pyarrow_ignore_timezone_environment_variable(monkeypatch
, timezone
):
1069 # note that any non-empty value will evaluate to true
1070 monkeypatch
.setenv("PYARROW_IGNORE_TIMEZONE", "1")
1072 datetime
.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1074 datetime
.datetime(2008, 1, 5, 5, 0, 0, 1000)
1076 pytz
.timezone('US/Eastern').localize(
1077 datetime
.datetime(2006, 1, 13, 12, 34, 56, 432539)
1079 pytz
.timezone('Europe/Moscow').localize(
1080 datetime
.datetime(2010, 8, 13, 5, 0, 0, 437699)
1084 expected
= [dt
.replace(tzinfo
=None) for dt
in data
]
1085 if timezone
is not None:
1086 tzinfo
= pytz
.timezone(timezone
)
1087 expected
= [tzinfo
.fromutc(dt
) for dt
in expected
]
1089 ty
= pa
.timestamp('us', tz
=timezone
)
1090 arr
= pa
.array(data
, type=ty
)
1091 assert arr
.to_pylist() == expected
1094 def test_sequence_timestamp_with_timezone_inference():
1096 datetime
.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1098 datetime
.datetime(2008, 1, 5, 5, 0, 0, 1000)
1101 pytz
.timezone('US/Eastern').localize(
1102 datetime
.datetime(2006, 1, 13, 12, 34, 56, 432539)
1104 pytz
.timezone('Europe/Moscow').localize(
1105 datetime
.datetime(2010, 8, 13, 5, 0, 0, 437699)
1109 pa
.timestamp('us', tz
=None),
1110 pa
.timestamp('us', tz
='UTC'),
1111 pa
.timestamp('us', tz
=None),
1112 pa
.timestamp('us', tz
='US/Eastern'),
1113 pa
.timestamp('us', tz
='Europe/Moscow')
1115 for dt
, expected_type
in zip(data
, expected
):
1116 prepended
= [dt
] + data
1117 arr
= pa
.array(prepended
)
1118 assert arr
.type == expected_type
1122 def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes():
1126 pd
.Timestamp(1184307814123456123, tz
=pytz
.timezone('US/Eastern'),
1128 datetime
.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1130 datetime
.datetime(2008, 1, 5, 5, 0, 0, 1000)
1135 data
[0].astimezone(pytz
.utc
),
1136 pytz
.utc
.localize(data
[1]),
1137 data
[2].astimezone(pytz
.utc
),
1141 arr
= pa
.array(data
)
1142 assert arr
.type == pa
.timestamp('us', tz
='US/Eastern')
1144 values
= arr
.cast('int64')
1145 expected
= [int(dt
.timestamp() * 10**6) if dt
else None for dt
in utcdata
]
1146 assert values
.to_pylist() == expected
def test_sequence_timestamp_out_of_bounds_nanosecond():
    """Datetimes past the ns-resolution range raise; 'us' resolution is fine.

    See https://issues.apache.org/jira/browse/ARROW-9768.
    """
    ns_type = pa.timestamp('ns')
    us_type = pa.timestamp('us')

    data = [datetime.datetime(2262, 4, 12)]
    with pytest.raises(ValueError, match="out of bounds"):
        pa.array(data, type=ns_type)

    # with microsecond resolution it works fine
    arr = pa.array(data, type=us_type)
    assert arr.to_pylist() == data

    # case where the naive is within bounds, but converted to UTC not
    minus_one_hour = datetime.timezone(datetime.timedelta(hours=-1))
    data = [datetime.datetime(2262, 4, 11, 23, tzinfo=minus_one_hour)]
    with pytest.raises(ValueError, match="out of bounds"):
        pa.array(data, type=ns_type)

    arr = pa.array(data, type=us_type)
    assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12)
1170 def test_sequence_numpy_timestamp():
1172 np
.datetime64(datetime
.datetime(2007, 7, 13, 1, 23, 34, 123456)),
1174 np
.datetime64(datetime
.datetime(2006, 1, 13, 12, 34, 56, 432539)),
1175 np
.datetime64(datetime
.datetime(2010, 8, 13, 5, 46, 57, 437699))
1177 arr
= pa
.array(data
)
1178 assert len(arr
) == 4
1179 assert arr
.type == pa
.timestamp('us')
1180 assert arr
.null_count
== 1
1181 assert arr
[0].as_py() == datetime
.datetime(2007, 7, 13, 1,
1183 assert arr
[1].as_py() is None
1184 assert arr
[2].as_py() == datetime
.datetime(2006, 1, 13, 12,
1186 assert arr
[3].as_py() == datetime
.datetime(2010, 8, 13, 5,
1190 class MyDate(datetime
.date
):
1194 class MyDatetime(datetime
.datetime
):
1198 class MyTimedelta(datetime
.timedelta
):
1202 def test_datetime_subclassing():
1204 MyDate(2007, 7, 13),
1206 date_type
= pa
.date32()
1207 arr_date
= pa
.array(data
, type=date_type
)
1208 assert len(arr_date
) == 1
1209 assert arr_date
.type == date_type
1210 assert arr_date
[0].as_py() == datetime
.date(2007, 7, 13)
1213 MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
1216 s
= pa
.timestamp('s')
1217 ms
= pa
.timestamp('ms')
1218 us
= pa
.timestamp('us')
1220 arr_s
= pa
.array(data
, type=s
)
1221 assert len(arr_s
) == 1
1222 assert arr_s
.type == s
1223 assert arr_s
[0].as_py() == datetime
.datetime(2007, 7, 13, 1,
1226 arr_ms
= pa
.array(data
, type=ms
)
1227 assert len(arr_ms
) == 1
1228 assert arr_ms
.type == ms
1229 assert arr_ms
[0].as_py() == datetime
.datetime(2007, 7, 13, 1,
1232 arr_us
= pa
.array(data
, type=us
)
1233 assert len(arr_us
) == 1
1234 assert arr_us
.type == us
1235 assert arr_us
[0].as_py() == datetime
.datetime(2007, 7, 13, 1,
1239 MyTimedelta(123, 456, 1002),
1242 s
= pa
.duration('s')
1243 ms
= pa
.duration('ms')
1244 us
= pa
.duration('us')
1246 arr_s
= pa
.array(data
)
1247 assert len(arr_s
) == 1
1248 assert arr_s
.type == us
1249 assert arr_s
[0].as_py() == datetime
.timedelta(123, 456, 1002)
1251 arr_s
= pa
.array(data
, type=s
)
1252 assert len(arr_s
) == 1
1253 assert arr_s
.type == s
1254 assert arr_s
[0].as_py() == datetime
.timedelta(123, 456)
1256 arr_ms
= pa
.array(data
, type=ms
)
1257 assert len(arr_ms
) == 1
1258 assert arr_ms
.type == ms
1259 assert arr_ms
[0].as_py() == datetime
.timedelta(123, 456, 1000)
1261 arr_us
= pa
.array(data
, type=us
)
1262 assert len(arr_us
) == 1
1263 assert arr_us
.type == us
1264 assert arr_us
[0].as_py() == datetime
.timedelta(123, 456, 1002)
1267 @pytest.mark
.xfail(not _pandas_api
.have_pandas
,
1268 reason
="pandas required for nanosecond conversion")
1269 def test_sequence_timestamp_nanoseconds():
1271 [datetime
.datetime(2007, 7, 13, 1, 23, 34, 123456)],
1272 [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
1276 ns
= pa
.timestamp('ns')
1277 arr_ns
= pa
.array(data
, type=ns
)
1278 assert len(arr_ns
) == 1
1279 assert arr_ns
.type == ns
1280 assert arr_ns
[0].as_py() == datetime
.datetime(2007, 7, 13, 1,
1285 def test_sequence_timestamp_from_int_with_unit():
1286 # TODO(wesm): This test might be rewritten to assert the actual behavior
1287 # when pandas is not installed
1291 s
= pa
.timestamp('s')
1292 ms
= pa
.timestamp('ms')
1293 us
= pa
.timestamp('us')
1294 ns
= pa
.timestamp('ns')
1296 arr_s
= pa
.array(data
, type=s
)
1297 assert len(arr_s
) == 1
1298 assert arr_s
.type == s
1299 assert repr(arr_s
[0]) == (
1300 "<pyarrow.TimestampScalar: datetime.datetime(1970, 1, 1, 0, 0, 1)>"
1302 assert str(arr_s
[0]) == "1970-01-01 00:00:01"
1304 arr_ms
= pa
.array(data
, type=ms
)
1305 assert len(arr_ms
) == 1
1306 assert arr_ms
.type == ms
1307 assert repr(arr_ms
[0].as_py()) == (
1308 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1000)"
1310 assert str(arr_ms
[0]) == "1970-01-01 00:00:00.001000"
1312 arr_us
= pa
.array(data
, type=us
)
1313 assert len(arr_us
) == 1
1314 assert arr_us
.type == us
1315 assert repr(arr_us
[0].as_py()) == (
1316 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1)"
1318 assert str(arr_us
[0]) == "1970-01-01 00:00:00.000001"
1320 arr_ns
= pa
.array(data
, type=ns
)
1321 assert len(arr_ns
) == 1
1322 assert arr_ns
.type == ns
1323 assert repr(arr_ns
[0].as_py()) == (
1324 "Timestamp('1970-01-01 00:00:00.000000001')"
1326 assert str(arr_ns
[0]) == "1970-01-01 00:00:00.000000001"
1328 expected_exc
= TypeError
1330 class CustomClass():
1333 for ty
in [ns
, pa
.date32(), pa
.date64()]:
1334 with pytest
.raises(expected_exc
):
1335 pa
.array([1, CustomClass()], type=ty
)
@pytest.mark.parametrize('np_scalar', [True, False])
def test_sequence_duration(np_scalar):
    """Timedelta sequences (python or numpy scalars) infer duration('us')."""
    first = datetime.timedelta(2, 3601, 1)
    last = datetime.timedelta(1, 100, 1000)
    if np_scalar:
        data = [np.timedelta64(first), None, np.timedelta64(last)]
    else:
        data = [first, None, last]

    arr = pa.array(data)
    assert len(arr) == 3
    assert arr.type == pa.duration('us')
    assert arr.null_count == 1
    assert arr[0].as_py() == first
    assert arr[1].as_py() is None
    assert arr[2].as_py() == last
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_sequence_duration_with_unit(unit):
    """Conversion truncates sub-unit precision to the requested duration unit."""
    data = [datetime.timedelta(3, 22, 1001)]
    truncated = {'s': datetime.timedelta(3, 22),
                 'ms': datetime.timedelta(3, 22, 1000),
                 'us': datetime.timedelta(3, 22, 1001),
                 'ns': datetime.timedelta(3, 22, 1001)}

    duration_type = pa.duration(unit)

    arr = pa.array(data, type=duration_type)
    assert len(arr) == 1
    assert arr.type == duration_type
    assert arr[0].as_py() == truncated[unit]
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_sequence_duration_from_int_with_unit(unit):
    """Plain integers convert to duration values in the requested unit."""
    duration_type = pa.duration(unit)
    arr = pa.array([5], type=duration_type)
    assert len(arr) == 1
    assert arr.type == duration_type
    assert arr[0].value == 5
def test_sequence_duration_nested_lists():
    """Nested lists of timedeltas infer list_(duration('us')) and round-trip."""
    delta_a = datetime.timedelta(1, 1, 1000)
    delta_b = datetime.timedelta(1, 100)
    data = [[delta_a, None], [delta_a, delta_b]]

    inferred = pa.array(data)
    assert len(inferred) == 2
    assert inferred.type == pa.list_(pa.duration('us'))
    assert inferred.to_pylist() == data

    ms_list_type = pa.list_(pa.duration('ms'))
    explicit = pa.array(data, type=ms_list_type)
    assert len(explicit) == 2
    assert explicit.type == ms_list_type
    assert explicit.to_pylist() == data
def test_sequence_duration_nested_lists_numpy():
    """Nested numpy timedelta64 values convert like python timedeltas."""
    delta_a = datetime.timedelta(1, 1, 1000)
    delta_b = datetime.timedelta(1, 100)
    # conversion back to python always yields datetime.timedelta objects
    round_tripped = [[delta_a, None], [delta_a, delta_b]]

    # plain nested lists of numpy scalars
    data = [[np.timedelta64(delta_a), None],
            [np.timedelta64(delta_a), np.timedelta64(delta_b)]]
    arr = pa.array(data)
    assert len(arr) == 2
    assert arr.type == pa.list_(pa.duration('us'))
    assert arr.to_pylist() == round_tripped

    # numpy arrays as the inner sequences
    data = [np.array([np.timedelta64(delta_a), None],
                     dtype='timedelta64[us]'),
            np.array([np.timedelta64(delta_a), np.timedelta64(delta_b)])]
    arr = pa.array(data)
    assert len(arr) == 2
    assert arr.type == pa.list_(pa.duration('us'))
    assert arr.to_pylist() == round_tripped
1423 def test_sequence_nesting_levels():
1425 arr
= pa
.array(data
)
1426 assert arr
.type == pa
.int64()
1427 assert arr
.to_pylist() == data
1429 data
= [[1], [2], None]
1430 arr
= pa
.array(data
)
1431 assert arr
.type == pa
.list_(pa
.int64())
1432 assert arr
.to_pylist() == data
1434 data
= [[1], [2, 3, 4], [None]]
1435 arr
= pa
.array(data
)
1436 assert arr
.type == pa
.list_(pa
.int64())
1437 assert arr
.to_pylist() == data
1439 data
= [None, [[None, 1]], [[2, 3, 4], None], [None]]
1440 arr
= pa
.array(data
)
1441 assert arr
.type == pa
.list_(pa
.list_(pa
.int64()))
1442 assert arr
.to_pylist() == data
1444 exceptions
= (pa
.ArrowInvalid
, pa
.ArrowTypeError
)
1446 # Mixed nesting levels are rejected
1447 with pytest
.raises(exceptions
):
1448 pa
.array([1, 2, [1]])
1450 with pytest
.raises(exceptions
):
1451 pa
.array([1, 2, []])
1453 with pytest
.raises(exceptions
):
1454 pa
.array([[1], [2], [None, [1]]])
1457 def test_sequence_mixed_types_fails():
1458 data
= ['a', 1, 2.0]
1459 with pytest
.raises(pa
.ArrowTypeError
):
1463 def test_sequence_mixed_types_with_specified_type_fails():
1464 data
= ['-10', '-5', {'a': 1}, '0', '5', '10']
1467 with pytest
.raises(TypeError):
1468 pa
.array(data
, type=type)
def test_sequence_decimal():
    """Python Decimal values round-trip through decimal128/decimal256 arrays."""
    data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=7, scale=3))
        assert arr.to_pylist() == data
def test_sequence_decimal_different_precisions():
    """Decimals with differing digit counts fit a wide-enough decimal type."""
    data = [
        decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
    ]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=13, scale=3))
        assert arr.to_pylist() == data
def test_sequence_decimal_no_scale():
    """Integral Decimals round-trip through a zero-scale decimal type."""
    data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=10))
        assert arr.to_pylist() == data
def test_sequence_decimal_negative():
    """Negative Decimal values round-trip through decimal arrays."""
    data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=10, scale=6))
        assert arr.to_pylist() == data
def test_sequence_decimal_no_whole_part():
    """Decimals without an integer part (precision == scale) round-trip."""
    data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=7, scale=7))
        assert arr.to_pylist() == data
def test_sequence_decimal_large_integer():
    """Decimals with large integer parts round-trip through decimal arrays."""
    data = [decimal.Decimal('-394029506937548693.42983'),
            decimal.Decimal('32358695912932.01033')]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=23, scale=5))
        assert arr.to_pylist() == data
def test_sequence_decimal_from_integers():
    """Plain python ints convert to decimal arrays as Decimal values."""
    data = [0, 1, -39402950693754869342983]
    expected = [decimal.Decimal(x) for x in data]
    # named `decimal_type` (not `type`) to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=28, scale=5))
        assert arr.to_pylist() == expected
def test_sequence_decimal_too_high_precision():
    """A python Decimal wider than arrow supports is rejected (ARROW-6989)."""
    too_wide = decimal.Decimal('1' * 80)
    with pytest.raises(ValueError, match="precision out of range"):
        pa.array([too_wide])
1530 def test_sequence_decimal_infer():
1533 (decimal
.Decimal('1.234'), pa
.decimal128(4, 3)),
1535 (decimal
.Decimal('12300'), pa
.decimal128(5, 0)),
1536 (decimal
.Decimal('12300.0'), pa
.decimal128(6, 1)),
1537 # scientific power notation
1538 (decimal
.Decimal('1.23E+4'), pa
.decimal128(5, 0)),
1539 (decimal
.Decimal('123E+2'), pa
.decimal128(5, 0)),
1540 (decimal
.Decimal('123E+4'), pa
.decimal128(7, 0)),
1542 (decimal
.Decimal('0.0123'), pa
.decimal128(4, 4)),
1543 (decimal
.Decimal('0.01230'), pa
.decimal128(5, 5)),
1544 (decimal
.Decimal('1.230E-2'), pa
.decimal128(5, 5)),
1546 assert pa
.infer_type([data
]) == typ
1547 arr
= pa
.array([data
])
1548 assert arr
.type == typ
1549 assert arr
.to_pylist()[0] == data
def test_sequence_decimal_infer_mixed():
    """Mixed-precision decimals infer a common type wide enough for all.

    See ARROW-12150.
    """
    cases = [
        ([decimal.Decimal('1.234'), decimal.Decimal('3.456')],
         pa.decimal128(4, 3)),
        ([decimal.Decimal('1.234'), decimal.Decimal('456.7')],
         pa.decimal128(6, 3)),
        ([decimal.Decimal('123.4'), decimal.Decimal('4.567')],
         pa.decimal128(6, 3)),
        ([decimal.Decimal('123e2'), decimal.Decimal('4567e3')],
         pa.decimal128(7, 0)),
        ([decimal.Decimal('123e4'), decimal.Decimal('4567e2')],
         pa.decimal128(7, 0)),
        ([decimal.Decimal('0.123'), decimal.Decimal('0.04567')],
         pa.decimal128(5, 5)),
        ([decimal.Decimal('0.001'), decimal.Decimal('1.01E5')],
         pa.decimal128(9, 3)),
    ]
    for data, expected_type in cases:
        assert pa.infer_type(data) == expected_type
        arr = pa.array(data)
        assert arr.type == expected_type
        assert arr.to_pylist() == data
1578 def test_sequence_decimal_given_type():
1579 for data
, typs
, wrong_typs
in [
1582 decimal
.Decimal('1.234'),
1583 [pa
.decimal128(4, 3), pa
.decimal128(5, 3), pa
.decimal128(5, 4)],
1584 [pa
.decimal128(4, 2), pa
.decimal128(4, 4)]
1588 decimal
.Decimal('12300'),
1589 [pa
.decimal128(5, 0), pa
.decimal128(6, 0), pa
.decimal128(3, -2)],
1590 [pa
.decimal128(4, 0), pa
.decimal128(3, -3)]
1592 # scientific power notation
1594 decimal
.Decimal('1.23E+4'),
1595 [pa
.decimal128(5, 0), pa
.decimal128(6, 0), pa
.decimal128(3, -2)],
1596 [pa
.decimal128(4, 0), pa
.decimal128(3, -3)]
1600 arr
= pa
.array([data
], type=typ
)
1601 assert arr
.type == typ
1602 assert arr
.to_pylist()[0] == data
1603 for typ
in wrong_typs
:
1604 with pytest
.raises(ValueError):
1605 pa
.array([data
], type=typ
)
def test_range_types():
    """A range object converts the same as an equivalent tuple."""
    from_range = pa.array(range(3))
    from_tuple = pa.array((0, 1, 2))
    assert from_range.equals(from_tuple)
def test_empty_range():
    """An empty range yields an empty, null-typed array with no nulls."""
    empty = pa.array(range(0))
    assert len(empty) == 0
    assert empty.null_count == 0
    assert empty.type == pa.null()
    assert empty.to_pylist() == []
1622 def test_structarray():
1623 arr
= pa
.StructArray
.from_arrays([], names
=[])
1624 assert arr
.type == pa
.struct([])
1625 assert len(arr
) == 0
1626 assert arr
.to_pylist() == []
1628 ints
= pa
.array([None, 2, 3], type=pa
.int64())
1629 strs
= pa
.array(['a', None, 'c'], type=pa
.string())
1630 bools
= pa
.array([True, False, None], type=pa
.bool_())
1631 arr
= pa
.StructArray
.from_arrays(
1632 [ints
, strs
, bools
],
1633 ['ints', 'strs', 'bools'])
1636 {'ints': None, 'strs': 'a', 'bools': True},
1637 {'ints': 2, 'strs': None, 'bools': False},
1638 {'ints': 3, 'strs': 'c', 'bools': None},
1641 pylist
= arr
.to_pylist()
1642 assert pylist
== expected
, (pylist
, expected
)
1644 # len(names) != len(arrays)
1645 with pytest
.raises(ValueError):
1646 pa
.StructArray
.from_arrays([ints
], ['ints', 'strs'])
1649 def test_struct_from_dicts():
1650 ty
= pa
.struct([pa
.field('a', pa
.int32()),
1651 pa
.field('b', pa
.string()),
1652 pa
.field('c', pa
.bool_())])
1653 arr
= pa
.array([], type=ty
)
1654 assert arr
.to_pylist() == []
1656 data
= [{'a': 5, 'b': 'foo', 'c': True},
1657 {'a': 6, 'b': 'bar', 'c': False}]
1658 arr
= pa
.array(data
, type=ty
)
1659 assert arr
.to_pylist() == data
1661 # With omitted values
1662 data
= [{'a': 5, 'c': True},
1665 {'a': None, 'b': 'bar'}]
1666 arr
= pa
.array(data
, type=ty
)
1667 expected
= [{'a': 5, 'b': None, 'c': True},
1669 {'a': None, 'b': None, 'c': None},
1670 {'a': None, 'b': 'bar', 'c': None}]
1671 assert arr
.to_pylist() == expected
1674 def test_struct_from_dicts_bytes_keys():
1676 ty
= pa
.struct([pa
.field('a', pa
.int32()),
1677 pa
.field('b', pa
.string()),
1678 pa
.field('c', pa
.bool_())])
1679 arr
= pa
.array([], type=ty
)
1680 assert arr
.to_pylist() == []
1682 data
= [{b
'a': 5, b
'b': 'foo'},
1683 {b
'a': 6, b
'c': False}]
1684 arr
= pa
.array(data
, type=ty
)
1685 assert arr
.to_pylist() == [
1686 {'a': 5, 'b': 'foo', 'c': None},
1687 {'a': 6, 'b': None, 'c': False},
1691 def test_struct_from_tuples():
1692 ty
= pa
.struct([pa
.field('a', pa
.int32()),
1693 pa
.field('b', pa
.string()),
1694 pa
.field('c', pa
.bool_())])
1696 data
= [(5, 'foo', True),
1698 expected
= [{'a': 5, 'b': 'foo', 'c': True},
1699 {'a': 6, 'b': 'bar', 'c': False}]
1700 arr
= pa
.array(data
, type=ty
)
1702 data_as_ndarray
= np
.empty(len(data
), dtype
=object)
1703 data_as_ndarray
[:] = data
1704 arr2
= pa
.array(data_as_ndarray
, type=ty
)
1705 assert arr
.to_pylist() == expected
1707 assert arr
.equals(arr2
)
1709 # With omitted values
1710 data
= [(5, 'foo', None),
1713 expected
= [{'a': 5, 'b': 'foo', 'c': None},
1715 {'a': 6, 'b': None, 'c': False}]
1716 arr
= pa
.array(data
, type=ty
)
1717 assert arr
.to_pylist() == expected
1719 # Invalid tuple size
1720 for tup
in [(5, 'foo'), (), ('5', 'foo', True, None)]:
1721 with pytest
.raises(ValueError, match
="(?i)tuple size"):
1722 pa
.array([tup
], type=ty
)
1725 def test_struct_from_list_of_pairs():
1727 pa
.field('a', pa
.int32()),
1728 pa
.field('b', pa
.string()),
1729 pa
.field('c', pa
.bool_())
1732 [('a', 5), ('b', 'foo'), ('c', True)],
1733 [('a', 6), ('b', 'bar'), ('c', False)],
1736 arr
= pa
.array(data
, type=ty
)
1737 assert arr
.to_pylist() == [
1738 {'a': 5, 'b': 'foo', 'c': True},
1739 {'a': 6, 'b': 'bar', 'c': False},
1743 # test with duplicated field names
1745 pa
.field('a', pa
.int32()),
1746 pa
.field('a', pa
.string()),
1747 pa
.field('b', pa
.bool_())
1750 [('a', 5), ('a', 'foo'), ('b', True)],
1751 [('a', 6), ('a', 'bar'), ('b', False)],
1753 arr
= pa
.array(data
, type=ty
)
1754 with pytest
.raises(ValueError):
1755 # TODO(kszucs): ARROW-9997
1758 # test with empty elements
1760 pa
.field('a', pa
.int32()),
1761 pa
.field('b', pa
.string()),
1762 pa
.field('c', pa
.bool_())
1766 [('a', 5), ('b', 'foo'), ('c', True)],
1767 [('a', 2), ('b', 'baz')],
1768 [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')],
1771 {'a': None, 'b': None, 'c': None},
1772 {'a': 5, 'b': 'foo', 'c': True},
1773 {'a': 2, 'b': 'baz', 'c': None},
1774 {'a': 1, 'b': 'bar', 'c': False},
1776 arr
= pa
.array(data
, type=ty
)
1777 assert arr
.to_pylist() == expected
1780 def test_struct_from_list_of_pairs_errors():
1782 pa
.field('a', pa
.int32()),
1783 pa
.field('b', pa
.string()),
1784 pa
.field('c', pa
.bool_())
1787 # test that it raises if the key doesn't match the expected field name
1790 [('a', 5), ('c', True), ('b', None)],
1792 msg
= "The expected field name is `b` but `c` was given"
1793 with pytest
.raises(ValueError, match
=msg
):
1794 pa
.array(data
, type=ty
)
1796 # test various errors both at the first position and after because of key
1799 r
"Could not convert {} with type {}: was expecting tuple of "
1800 r
"(key, value) pair"
1803 tuple(), # empty key-value pair
1804 tuple('a',), # missing value
1805 tuple('unknown-key',), # not known field name
1806 'string', # not a tuple
1808 for key_value_pair
in cases
:
1809 msg
= re
.escape(template
.format(
1810 repr(key_value_pair
), type(key_value_pair
).__name
__
1813 with pytest
.raises(TypeError, match
=msg
):
1816 [('a', 5), ('b', 'foo'), ('c', None)],
1819 with pytest
.raises(TypeError, match
=msg
):
1821 [('a', 5), ('b', 'foo'), ('c', None)],
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])
    mixed = [(5, 'foo', True),
             {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(mixed, type=struct_type)
1837 def test_struct_from_dicts_inference():
1838 expected_type
= pa
.struct([pa
.field('a', pa
.int64()),
1839 pa
.field('b', pa
.string()),
1840 pa
.field('c', pa
.bool_())])
1841 data
= [{'a': 5, 'b': 'foo', 'c': True},
1842 {'a': 6, 'b': 'bar', 'c': False}]
1844 arr
= pa
.array(data
)
1845 check_struct_type(arr
.type, expected_type
)
1846 assert arr
.to_pylist() == data
1848 # With omitted values
1849 data
= [{'a': 5, 'c': True},
1852 {'a': None, 'b': 'bar'}]
1853 expected
= [{'a': 5, 'b': None, 'c': True},
1855 {'a': None, 'b': None, 'c': None},
1856 {'a': None, 'b': 'bar', 'c': None}]
1858 arr
= pa
.array(data
)
1859 data_as_ndarray
= np
.empty(len(data
), dtype
=object)
1860 data_as_ndarray
[:] = data
1861 arr2
= pa
.array(data
)
1863 check_struct_type(arr
.type, expected_type
)
1864 assert arr
.to_pylist() == expected
1865 assert arr
.equals(arr2
)
1868 expected_type
= pa
.struct([
1869 pa
.field('a', pa
.struct([pa
.field('aa', pa
.list_(pa
.int64())),
1870 pa
.field('ab', pa
.bool_())])),
1871 pa
.field('b', pa
.string())])
1872 data
= [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
1873 {'a': {'aa': None, 'ab': False}, 'b': None},
1874 {'a': None, 'b': 'bar'}]
1875 arr
= pa
.array(data
)
1877 assert arr
.to_pylist() == data
1880 arr
= pa
.array([{}])
1881 assert arr
.type == pa
.struct([])
1882 assert arr
.to_pylist() == [{}]
1884 # Mixing structs and scalars is rejected
1885 with pytest
.raises((pa
.ArrowInvalid
, pa
.ArrowTypeError
)):
1886 pa
.array([1, {'a': 2}])
1889 def test_structarray_from_arrays_coerce():
1892 strs
= ['a', None, 'c']
1893 bools
= [True, False, None]
1894 ints_nonnull
= [1, 2, 3]
1896 arrays
= [ints
, strs
, bools
, ints_nonnull
]
1897 result
= pa
.StructArray
.from_arrays(arrays
,
1898 ['ints', 'strs', 'bools',
1900 expected
= pa
.StructArray
.from_arrays(
1901 [pa
.array(ints
, type='int64'),
1902 pa
.array(strs
, type='utf8'),
1904 pa
.array(ints_nonnull
, type='int64')],
1905 ['ints', 'strs', 'bools', 'int_nonnull'])
1907 with pytest
.raises(ValueError):
1908 pa
.StructArray
.from_arrays(arrays
)
1910 assert result
.equals(expected
)
def test_decimal_array_with_none_and_nan():
    """NaN-like entries become nulls only when from_pandas=True is given."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # ARROW-6227: Without from_pandas=True, NaN is considered a float
    with pytest.raises(TypeError):
        pa.array(values)

    converted = pa.array(values, from_pandas=True)
    assert converted.type == pa.decimal128(4, 3)
    assert converted.to_pylist() == values[:2] + [None, None]

    converted = pa.array(values, type=pa.decimal128(10, 4), from_pandas=True)
    assert converted.to_pylist() == [decimal.Decimal('1.2340'),
                                     None, None, None]
1928 def test_map_from_dicts():
1929 data
= [[{'key': b
'a', 'value': 1}, {'key': b
'b', 'value': 2}],
1930 [{'key': b
'c', 'value': 3}],
1931 [{'key': b
'd', 'value': 4}, {'key': b
'e', 'value': 5},
1932 {'key': b
'f', 'value': None}],
1933 [{'key': b
'g', 'value': 7}]]
1934 expected
= [[(d
['key'], d
['value']) for d
in entry
] for entry
in data
]
1936 arr
= pa
.array(expected
, type=pa
.map_(pa
.binary(), pa
.int32()))
1938 assert arr
.to_pylist() == expected
1940 # With omitted values
1944 arr
= pa
.array(expected
, type=pa
.map_(pa
.binary(), pa
.int32()))
1946 assert arr
.to_pylist() == expected
1948 # Invalid dictionary
1949 for entry
in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]:
1950 with pytest
.raises(ValueError, match
="Invalid Map"):
1951 pa
.array([entry
], type=pa
.map_('i4', 'i4'))
1953 # Invalid dictionary types
1954 for entry
in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]:
1955 with pytest
.raises(pa
.ArrowInvalid
, match
="tried to convert to int"):
1956 pa
.array([entry
], type=pa
.map_('i4', 'i4'))
1959 def test_map_from_tuples():
1960 expected
= [[(b
'a', 1), (b
'b', 2)],
1962 [(b
'd', 4), (b
'e', 5), (b
'f', None)],
1965 arr
= pa
.array(expected
, type=pa
.map_(pa
.binary(), pa
.int32()))
1967 assert arr
.to_pylist() == expected
1969 # With omitted values
1972 arr
= pa
.array(expected
, type=pa
.map_(pa
.binary(), pa
.int32()))
1974 assert arr
.to_pylist() == expected
1976 # Invalid tuple size
1977 for entry
in [[(5,)], [()], [('5', 'foo', True)]]:
1978 with pytest
.raises(ValueError, match
="(?i)tuple size"):
1979 pa
.array([entry
], type=pa
.map_('i4', 'i4'))
def test_dictionary_from_boolean():
    """Booleans build a dictionary array with dense indices and dictionary."""
    typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
    result = pa.array([False, False, True, False, True], type=typ)
    assert isinstance(result.type, pa.DictionaryType)
    assert result.type.equals(typ)

    assert result.indices.equals(pa.array([0, 0, 1, 0, 1], type=pa.int8()))
    assert result.dictionary.equals(pa.array([False, True], type=pa.bool_()))
1994 @pytest.mark
.parametrize('value_type', [
def test_dictionary_from_integers(value_type):
    """Integers build a dictionary array for any integer value type."""
    typ = pa.dictionary(pa.int8(), value_type=value_type)
    result = pa.array([1, 2, 1, 1, 2, 3], type=typ)
    assert isinstance(result.type, pa.DictionaryType)
    assert result.type.equals(typ)

    assert result.indices.equals(pa.array([0, 1, 0, 0, 1, 2], type=pa.int8()))
    assert result.dictionary.equals(pa.array([1, 2, 3], type=value_type))
2018 @pytest.mark
.parametrize('input_index_type', [
def test_dictionary_index_type(input_index_type):
    """The requested index type is honored as the minimal index width.

    The dictionary array is constructed with an adaptive index type builder,
    but the input index type is taken as the minimal width type to use.
    """
    dict_type = pa.dictionary(input_index_type, value_type=pa.int64())
    arr = pa.array(range(10), type=dict_type)
    assert arr.type.equals(dict_type)
def test_dictionary_is_always_adaptive():
    """The index type widens automatically when the dictionary outgrows it.

    The builder is adaptive: the output index type may end up wider than the
    requested one, depending on how many distinct values the input holds.
    """
    typ = pa.dictionary(pa.int8(), value_type=pa.int64())

    # 2**7 distinct values still fit an int8 index
    fits = pa.array(range(2**7), type=typ)
    assert fits.type.equals(pa.dictionary(pa.int8(), pa.int64()))

    # one more value forces a widening to int16
    widened = pa.array(range(2**7 + 1), type=typ)
    assert widened.type.equals(pa.dictionary(pa.int16(), pa.int64()))
2048 def test_dictionary_from_strings():
2049 for value_type
in [pa
.binary(), pa
.string()]:
2050 typ
= pa
.dictionary(pa
.int8(), value_type
)
2051 a
= pa
.array(["", "a", "bb", "a", "bb", "ccc"], type=typ
)
2053 assert isinstance(a
.type, pa
.DictionaryType
)
2055 expected_indices
= pa
.array([0, 1, 2, 1, 2, 3], type=pa
.int8())
2056 expected_dictionary
= pa
.array(["", "a", "bb", "ccc"], type=value_type
)
2057 assert a
.indices
.equals(expected_indices
)
2058 assert a
.dictionary
.equals(expected_dictionary
)
2060 # fixed size binary type
2061 typ
= pa
.dictionary(pa
.int8(), pa
.binary(3))
2062 a
= pa
.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ
)
2063 assert isinstance(a
.type, pa
.DictionaryType
)
2065 expected_indices
= pa
.array([0, 0, 1, 2, 1], type=pa
.int8())
2066 expected_dictionary
= pa
.array(["aaa", "bbb", "ccc"], type=pa
.binary(3))
2067 assert a
.indices
.equals(expected_indices
)
2068 assert a
.dictionary
.equals(expected_dictionary
)
2071 @pytest.mark
.parametrize(('unit', 'expected'), [
2072 ('s', datetime
.timedelta(seconds
=-2147483000)),
2073 ('ms', datetime
.timedelta(milliseconds
=-2147483000)),
2074 ('us', datetime
.timedelta(microseconds
=-2147483000)),
2075 ('ns', datetime
.timedelta(microseconds
=-2147483))
2077 def test_duration_array_roundtrip_corner_cases(unit
, expected
):
2078 # Corner case discovered by hypothesis: there were implicit conversions to
2079 # unsigned values resulting wrong values with wrong signs.
2080 ty
= pa
.duration(unit
)
2081 arr
= pa
.array([-2147483000], type=ty
)
2082 restored
= pa
.array(arr
.to_pylist(), type=ty
)
2083 assert arr
.equals(restored
)
2085 expected_list
= [expected
]
2087 # if pandas is available then a pandas Timedelta is returned
2093 expected_list
= [pd
.Timedelta(-2147483000, unit
='ns')]
2095 assert restored
.to_pylist() == expected_list
# NOTE(review): the decorator and pandas import lines were lost in
# extraction; the body uses `pd` throughout, so a pandas marker and a local
# import are reconstructed — confirm against upstream.
@pytest.mark.pandas
def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
    # corner case discovered by hypothesis: preserving the nanoseconds on
    # conversion from a list of Timedelta and Timestamp objects
    import pandas as pd

    ty = pa.duration('ns')
    arr = pa.array([9223371273709551616], type=ty)
    data = arr.to_pylist()
    assert isinstance(data[0], pd.Timedelta)
    restored = pa.array(data, type=ty)
    assert arr.equals(restored)
    assert restored.to_pylist() == [
        pd.Timedelta(9223371273709551616, unit='ns')
    ]

    ty = pa.timestamp('ns')
    arr = pa.array([9223371273709551616], type=ty)
    data = arr.to_pylist()
    assert isinstance(data[0], pd.Timestamp)
    restored = pa.array(data, type=ty)
    assert arr.equals(restored)
    assert restored.to_pylist() == [
        pd.Timestamp(9223371273709551616, unit='ns')
    ]

    ty = pa.timestamp('ns', tz='US/Eastern')
    value = 1604119893000000000
    arr = pa.array([value], type=ty)
    data = arr.to_pylist()
    assert isinstance(data[0], pd.Timestamp)
    restored = pa.array(data, type=ty)
    assert arr.equals(restored)
    assert restored.to_pylist() == [
        pd.Timestamp(value, unit='ns').tz_localize(
            "UTC").tz_convert('US/Eastern')
    ]
@h.given(past.all_arrays)
def test_array_to_pylist_roundtrip(arr):
    # Any hypothesis-generated array must survive a to_pylist / pa.array
    # round trip unchanged.
    as_list = arr.to_pylist()
    rebuilt = pa.array(as_list, type=arr.type)
    assert rebuilt.equals(arr)
@pytest.mark.large_memory
def test_auto_chunking_binary_like():
    # Values sized so that 21 copies of v1 plus v2 just fit in one chunk's
    # 2 GiB data buffer; anything more must spill into extra chunks.
    v1 = b'x' * 100000000
    v2 = b'x' * 147483646

    # single chunk
    one_chunk_data = [v1] * 20 + [b'', None, v2]
    arr = pa.array(one_chunk_data, type=pa.binary())
    assert isinstance(arr, pa.Array)
    assert len(arr) == 23
    assert arr[20].as_py() == b''
    assert arr[21].as_py() is None
    assert arr[22].as_py() == v2

    # two chunks
    two_chunk_data = one_chunk_data + [b'two']
    arr = pa.array(two_chunk_data, type=pa.binary())
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 23
    assert len(arr.chunk(1)) == 1
    assert arr.chunk(0)[20].as_py() == b''
    assert arr.chunk(0)[21].as_py() is None
    assert arr.chunk(0)[22].as_py() == v2
    assert arr.chunk(1).to_pylist() == [b'two']

    # three chunks
    three_chunk_data = one_chunk_data * 2 + [b'three', b'three']
    arr = pa.array(three_chunk_data, type=pa.binary())
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 3
    assert len(arr.chunk(0)) == 23
    assert len(arr.chunk(1)) == 23
    assert len(arr.chunk(2)) == 2
    # NOTE(review): the loop header was lost in extraction; chunks 0 and 1
    # are both copies of one_chunk_data, so the per-chunk checks are
    # reconstructed as a range(2) loop — confirm against upstream.
    for i in range(2):
        assert arr.chunk(i)[20].as_py() == b''
        assert arr.chunk(i)[21].as_py() is None
        assert arr.chunk(i)[22].as_py() == v2
    assert arr.chunk(2).to_pylist() == [b'three', b'three']
@pytest.mark.large_memory
def test_auto_chunking_list_of_binary():
    # Enough single-element lists of 1 KiB strings to exceed one chunk's
    # capacity, forcing the conversion to return a two-chunk ChunkedArray.
    items = [['x' * 1024]] * ((2 << 20) + 1)
    result = pa.array(items)
    assert isinstance(result, pa.ChunkedArray)
    assert result.num_chunks == 2
    assert len(result.chunk(0)) == 2**21 - 1
    assert len(result.chunk(1)) == 2
    assert result.chunk(1).to_pylist() == [['x' * 1024]] * 2
@pytest.mark.large_memory
def test_auto_chunking_list_like():
    # 7 items of 2**28 uint8 values fit in a single chunk...
    item = np.ones((2**28,), dtype='uint8')
    data = [item] * (2**3 - 1)
    arr = pa.array(data, type=pa.list_(pa.uint8()))
    assert isinstance(arr, pa.Array)
    assert len(arr) == 7

    # ...but 8 items overflow it and must produce a second chunk.
    item = np.ones((2**28,), dtype='uint8')
    data = [item] * 2**3
    arr = pa.array(data, type=pa.list_(pa.uint8()))
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 7
    assert len(arr.chunk(1)) == 1
    chunk = arr.chunk(1)
    # NOTE(review): the assignment of `scalar` was lost in extraction;
    # reconstructed as the sole element of the overflow chunk — confirm
    # against upstream.
    scalar = chunk[0]
    assert isinstance(scalar, pa.ListScalar)
    expected = pa.array(item, type=pa.uint8())
    assert scalar.values == expected
@pytest.mark.large_memory
def test_auto_chunking_map_type():
    # takes ~20 minutes locally
    # Eight rows of 2**28 map entries overflow a single chunk, so the
    # conversion must split the result 7 + 1.
    map_ty = pa.map_(pa.int8(), pa.int8())
    entries = [(1, 1)] * 2**28
    rows = [entries] * 2**3
    result = pa.array(rows, type=map_ty)
    assert isinstance(result, pa.ChunkedArray)
    assert len(result.chunk(0)) == 7
    assert len(result.chunk(1)) == 1
@pytest.mark.large_memory
@pytest.mark.parametrize(('ty', 'char'), [
    # NOTE(review): one parametrize entry and the closing brackets were lost
    # in extraction; the string case is reconstructed to mirror the binary
    # one — confirm against upstream.
    (pa.string(), 'x'),
    (pa.binary(), b'x'),
])
def test_nested_auto_chunking(ty, char):
    # A struct column whose string-like child overflows one chunk's data
    # buffer must make the whole struct array come back chunked.
    v1 = char * 100000000
    v2 = char * 147483646

    struct_type = pa.struct([
        pa.field('bool', pa.bool_()),
        pa.field('integer', pa.int64()),
        pa.field('string-like', ty),
    ])

    data = [{'bool': True, 'integer': 1, 'string-like': v1}] * 20
    data.append({'bool': True, 'integer': 1, 'string-like': v2})
    arr = pa.array(data, type=struct_type)
    assert isinstance(arr, pa.Array)

    data.append({'bool': True, 'integer': 1, 'string-like': char})
    arr = pa.array(data, type=struct_type)
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 21
    assert len(arr.chunk(1)) == 1
    # NOTE(review): the dict literal body here was lost in extraction;
    # reconstructed from the appended row above — confirm against upstream.
    assert arr.chunk(1)[0].as_py() == {
        'bool': True,
        'integer': 1,
        'string-like': char
    }
@pytest.mark.large_memory
def test_array_from_pylist_data_overflow():
    # Regression test for ARROW-12983
    # Data buffer overflow - should result in chunked array
    items = [b'a' * 4096] * (2 ** 19)

    def check(result):
        # Each variant must come back chunked, full length, in >1 chunk.
        assert isinstance(result, pa.ChunkedArray)
        assert len(result) == 2**19
        assert len(result.chunks) > 1

    check(pa.array(items, type=pa.string()))

    validity = np.zeros(2**19, bool)
    check(pa.array(items, mask=validity, type=pa.string()))

    check(pa.array(items, type=pa.binary()))
@pytest.mark.large_memory
def test_array_from_pylist_offset_overflow():
    # Regression test for ARROW-12983
    # Offset buffer overflow - should result in chunked array
    # Note this doesn't apply to primitive arrays
    items = [b'a'] * (2 ** 31)

    def check(result):
        # Each variant must come back chunked, full length, in >1 chunk.
        assert isinstance(result, pa.ChunkedArray)
        assert len(result) == 2**31
        assert len(result.chunks) > 1

    check(pa.array(items, type=pa.string()))

    validity = np.zeros(2**31, bool)
    check(pa.array(items, mask=validity, type=pa.string()))

    check(pa.array(items, type=pa.binary()))