ceph/src/arrow/python/pyarrow/tests/test_convert_builtin.py
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 import collections
19 import datetime
20 import decimal
21 import itertools
22 import math
23 import re
24
25 import hypothesis as h
26 import numpy as np
27 import pytz
28 import pytest
29
30 from pyarrow.pandas_compat import _pandas_api # noqa
31 import pyarrow as pa
32 import pyarrow.tests.strategies as past
33
34
35 int_type_pairs = [
36 (np.int8, pa.int8()),
37 (np.int16, pa.int16()),
38 (np.int32, pa.int32()),
39 (np.int64, pa.int64()),
40 (np.uint8, pa.uint8()),
41 (np.uint16, pa.uint16()),
42 (np.uint32, pa.uint32()),
43 (np.uint64, pa.uint64())]
44
45
46 np_int_types, pa_int_types = zip(*int_type_pairs)
47
48
49 class StrangeIterable:
50 def __init__(self, lst):
51 self.lst = lst
52
53 def __iter__(self):
54 return self.lst.__iter__()
55
56
57 class MyInt:
58 def __init__(self, value):
59 self.value = value
60
61 def __int__(self):
62 return self.value
63
64
65 class MyBrokenInt:
66 def __int__(self):
67 1/0 # MARKER
68
69
70 def check_struct_type(ty, expected):
71 """
72 Check that a struct type matches the expected type, ignoring field order.
73 """
74 assert pa.types.is_struct(ty)
75 assert set(ty) == set(expected)
76
77
78 def test_iterable_types():
79 arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
80 arr2 = pa.array((0, 1, 2, 3))
81
82 assert arr1.equals(arr2)
83
84
85 def test_empty_iterable():
86 arr = pa.array(StrangeIterable([]))
87 assert len(arr) == 0
88 assert arr.null_count == 0
89 assert arr.type == pa.null()
90 assert arr.to_pylist() == []
91
92
93 def test_limited_iterator_types():
94 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
95 arr2 = pa.array((0, 1, 2))
96 assert arr1.equals(arr2)
97
98
99 def test_limited_iterator_size_overflow():
100 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
101 arr2 = pa.array((0, 1))
102 assert arr1.equals(arr2)
103
104
105 def test_limited_iterator_size_underflow():
106 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
107 arr2 = pa.array((0, 1, 2))
108 assert arr1.equals(arr2)
109
110
111 def test_iterator_without_size():
112 expected = pa.array((0, 1, 2))
113 arr1 = pa.array(iter(range(3)))
114 assert arr1.equals(expected)
115 # Same with explicit type
116 arr1 = pa.array(iter(range(3)), type=pa.int64())
117 assert arr1.equals(expected)
118
119
120 def test_infinite_iterator():
121 expected = pa.array((0, 1, 2))
122 arr1 = pa.array(itertools.count(0), size=3)
123 assert arr1.equals(expected)
124 # Same with explicit type
125 arr1 = pa.array(itertools.count(0), type=pa.int64(), size=3)
126 assert arr1.equals(expected)
127
128
129 def _as_list(xs):
130 return xs
131
132
133 def _as_tuple(xs):
134 return tuple(xs)
135
136
137 def _as_deque(xs):
138 # deque is a sequence, but is neither a tuple nor a list
139 return collections.deque(xs)
140
141
142 def _as_dict_values(xs):
143 # a dict values object is not a sequence, just a regular iterable
144 dct = {k: v for k, v in enumerate(xs)}
145 return dct.values()
146
147
148 def _as_numpy_array(xs):
149 arr = np.empty(len(xs), dtype=object)
150 arr[:] = xs
151 return arr
152
153
154 def _as_set(xs):
155 return set(xs)
156
157
158 SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
159 ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
160 COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES
161
162 parametrize_with_iterable_types = pytest.mark.parametrize(
163 "seq", ITERABLE_TYPES
164 )
165
166 parametrize_with_sequence_types = pytest.mark.parametrize(
167 "seq", SEQUENCE_TYPES
168 )
169
170 parametrize_with_collections_types = pytest.mark.parametrize(
171 "seq", COLLECTIONS_TYPES
172 )
173
174
175 @parametrize_with_collections_types
176 def test_sequence_types(seq):
177 arr1 = pa.array(seq([1, 2, 3]))
178 arr2 = pa.array([1, 2, 3])
179
180 assert arr1.equals(arr2)
181
182
183 @parametrize_with_iterable_types
184 def test_nested_sequence_types(seq):
185 arr1 = pa.array([seq([1, 2, 3])])
186 arr2 = pa.array([[1, 2, 3]])
187
188 assert arr1.equals(arr2)
189
190
191 @parametrize_with_sequence_types
192 def test_sequence_boolean(seq):
193 expected = [True, None, False, None]
194 arr = pa.array(seq(expected))
195 assert len(arr) == 4
196 assert arr.null_count == 2
197 assert arr.type == pa.bool_()
198 assert arr.to_pylist() == expected
199
200
201 @parametrize_with_sequence_types
202 def test_sequence_numpy_boolean(seq):
203 expected = [np.bool_(True), None, np.bool_(False), None]
204 arr = pa.array(seq(expected))
205 assert arr.type == pa.bool_()
206 assert arr.to_pylist() == [True, None, False, None]
207
208
209 @parametrize_with_sequence_types
210 def test_sequence_mixed_numpy_python_bools(seq):
211 values = np.array([True, False])
212 arr = pa.array(seq([values[0], None, values[1], True, False]))
213 assert arr.type == pa.bool_()
214 assert arr.to_pylist() == [True, None, False, True, False]
215
216
217 @parametrize_with_collections_types
218 def test_empty_list(seq):
219 arr = pa.array(seq([]))
220 assert len(arr) == 0
221 assert arr.null_count == 0
222 assert arr.type == pa.null()
223 assert arr.to_pylist() == []
224
225
226 @parametrize_with_sequence_types
227 def test_nested_lists(seq):
228 data = [[], [1, 2], None]
229 arr = pa.array(seq(data))
230 assert len(arr) == 3
231 assert arr.null_count == 1
232 assert arr.type == pa.list_(pa.int64())
233 assert arr.to_pylist() == data
234 # With explicit type
235 arr = pa.array(seq(data), type=pa.list_(pa.int32()))
236 assert len(arr) == 3
237 assert arr.null_count == 1
238 assert arr.type == pa.list_(pa.int32())
239 assert arr.to_pylist() == data
240
241
242 @parametrize_with_sequence_types
243 def test_nested_large_lists(seq):
244 data = [[], [1, 2], None]
245 arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
246 assert len(arr) == 3
247 assert arr.null_count == 1
248 assert arr.type == pa.large_list(pa.int16())
249 assert arr.to_pylist() == data
250
251
252 @parametrize_with_collections_types
253 def test_list_with_non_list(seq):
254 # List types don't accept non-sequences
255 with pytest.raises(TypeError):
256 pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
257 with pytest.raises(TypeError):
258 pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
259
260
261 @parametrize_with_sequence_types
262 def test_nested_arrays(seq):
263 arr = pa.array(seq([np.array([], dtype=np.int64),
264 np.array([1, 2], dtype=np.int64), None]))
265 assert len(arr) == 3
266 assert arr.null_count == 1
267 assert arr.type == pa.list_(pa.int64())
268 assert arr.to_pylist() == [[], [1, 2], None]
269
270
271 @parametrize_with_sequence_types
272 def test_nested_fixed_size_list(seq):
273 # sequence of lists
274 data = [[1, 2], [3, None], None]
275 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
276 assert len(arr) == 3
277 assert arr.null_count == 1
278 assert arr.type == pa.list_(pa.int64(), 2)
279 assert arr.to_pylist() == data
280
281 # sequence of numpy arrays
282 data = [np.array([1, 2], dtype='int64'), np.array([3, 4], dtype='int64'),
283 None]
284 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
285 assert len(arr) == 3
286 assert arr.null_count == 1
287 assert arr.type == pa.list_(pa.int64(), 2)
288 assert arr.to_pylist() == [[1, 2], [3, 4], None]
289
290 # incorrect length of the lists or arrays
291 data = [[1, 2, 4], [3, None], None]
292 for data in [[[1, 2, 3]], [np.array([1, 2, 4], dtype='int64')]]:
293 with pytest.raises(
294 ValueError, match="Length of item not correct: expected 2"):
295 pa.array(seq(data), type=pa.list_(pa.int64(), 2))
296
297 # with list size of 0
298 data = [[], [], None]
299 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 0))
300 assert len(arr) == 3
301 assert arr.null_count == 1
302 assert arr.type == pa.list_(pa.int64(), 0)
303 assert arr.to_pylist() == [[], [], None]
304
305
306 @parametrize_with_sequence_types
307 def test_sequence_all_none(seq):
308 arr = pa.array(seq([None, None]))
309 assert len(arr) == 2
310 assert arr.null_count == 2
311 assert arr.type == pa.null()
312 assert arr.to_pylist() == [None, None]
313
314
315 @parametrize_with_sequence_types
316 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
317 def test_sequence_integer(seq, np_scalar_pa_type):
318 np_scalar, pa_type = np_scalar_pa_type
319 expected = [1, None, 3, None,
320 np.iinfo(np_scalar).min, np.iinfo(np_scalar).max]
321 arr = pa.array(seq(expected), type=pa_type)
322 assert len(arr) == 6
323 assert arr.null_count == 2
324 assert arr.type == pa_type
325 assert arr.to_pylist() == expected
326
327
328 @parametrize_with_collections_types
329 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
330 def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
331 # ARROW-2806: numpy.nan is a double value and thus should produce
332 # a double array.
333 _, pa_type = np_scalar_pa_type
334 with pytest.raises(ValueError):
335 pa.array(seq([np.nan]), type=pa_type, from_pandas=False)
336
337 arr = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
338 expected = [None]
339 assert len(arr) == 1
340 assert arr.null_count == 1
341 assert arr.type == pa_type
342 assert arr.to_pylist() == expected
343
344
345 @parametrize_with_sequence_types
346 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
347 def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
348 # ARROW-2806: numpy.nan is a double value and thus should produce
349 # a double array.
350 _, pa_type = np_scalar_pa_type
351 with pytest.raises(ValueError):
352 pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)
353
354 arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
355 expected = [[None]]
356 assert len(arr) == 1
357 assert arr.null_count == 0
358 assert arr.type == pa.list_(pa_type)
359 assert arr.to_pylist() == expected
360
361
362 @parametrize_with_sequence_types
363 def test_sequence_integer_inferred(seq):
364 expected = [1, None, 3, None]
365 arr = pa.array(seq(expected))
366 assert len(arr) == 4
367 assert arr.null_count == 2
368 assert arr.type == pa.int64()
369 assert arr.to_pylist() == expected
370
371
372 @parametrize_with_sequence_types
373 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
374 def test_sequence_numpy_integer(seq, np_scalar_pa_type):
375 np_scalar, pa_type = np_scalar_pa_type
376 expected = [np_scalar(1), None, np_scalar(3), None,
377 np_scalar(np.iinfo(np_scalar).min),
378 np_scalar(np.iinfo(np_scalar).max)]
379 arr = pa.array(seq(expected), type=pa_type)
380 assert len(arr) == 6
381 assert arr.null_count == 2
382 assert arr.type == pa_type
383 assert arr.to_pylist() == expected
384
385
386 @parametrize_with_sequence_types
387 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
388 def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
389 np_scalar, pa_type = np_scalar_pa_type
390 expected = [np_scalar(1), None, np_scalar(3), None]
391 expected += [np_scalar(np.iinfo(np_scalar).min),
392 np_scalar(np.iinfo(np_scalar).max)]
393 arr = pa.array(seq(expected))
394 assert len(arr) == 6
395 assert arr.null_count == 2
396 assert arr.type == pa_type
397 assert arr.to_pylist() == expected
398
399
400 @parametrize_with_sequence_types
401 def test_sequence_custom_integers(seq):
402 expected = [0, 42, 2**33 + 1, -2**63]
403 data = list(map(MyInt, expected))
404 arr = pa.array(seq(data), type=pa.int64())
405 assert arr.to_pylist() == expected
406
407
408 @parametrize_with_collections_types
409 def test_broken_integers(seq):
410 data = [MyBrokenInt()]
411 with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
412 pa.array(seq(data), type=pa.int64())
413
414
415 def test_numpy_scalars_mixed_type():
416 # ARROW-4324
417 data = [np.int32(10), np.float32(0.5)]
418 arr = pa.array(data)
419 expected = pa.array([10, 0.5], type="float64")
420 assert arr.equals(expected)
421
422 # ARROW-9490
423 data = [np.int8(10), np.float32(0.5)]
424 arr = pa.array(data)
425 expected = pa.array([10, 0.5], type="float32")
426 assert arr.equals(expected)
427
428
429 @pytest.mark.xfail(reason="Type inference for uint64 not implemented",
430 raises=OverflowError)
431 def test_uint64_max_convert():
432 data = [0, np.iinfo(np.uint64).max]
433
434 arr = pa.array(data, type=pa.uint64())
435 expected = pa.array(np.array(data, dtype='uint64'))
436 assert arr.equals(expected)
437
438 arr_inferred = pa.array(data)
439 assert arr_inferred.equals(expected)
440
441
442 @pytest.mark.parametrize("bits", [8, 16, 32, 64])
443 def test_signed_integer_overflow(bits):
444 ty = getattr(pa, "int%d" % bits)()
445 # XXX ideally would always raise OverflowError
446 with pytest.raises((OverflowError, pa.ArrowInvalid)):
447 pa.array([2 ** (bits - 1)], ty)
448 with pytest.raises((OverflowError, pa.ArrowInvalid)):
449 pa.array([-2 ** (bits - 1) - 1], ty)
450
451
452 @pytest.mark.parametrize("bits", [8, 16, 32, 64])
453 def test_unsigned_integer_overflow(bits):
454 ty = getattr(pa, "uint%d" % bits)()
455 # XXX ideally would always raise OverflowError
456 with pytest.raises((OverflowError, pa.ArrowInvalid)):
457 pa.array([2 ** bits], ty)
458 with pytest.raises((OverflowError, pa.ArrowInvalid)):
459 pa.array([-1], ty)
460
461
462 @parametrize_with_collections_types
463 @pytest.mark.parametrize("typ", pa_int_types)
464 def test_integer_from_string_error(seq, typ):
465 # ARROW-9451: pa.array(['1'], type=pa.uint32()) should not succeed
466 with pytest.raises(pa.ArrowInvalid):
467 pa.array(seq(['1']), type=typ)
468
469
470 def test_convert_with_mask():
471 data = [1, 2, 3, 4, 5]
472 mask = np.array([False, True, False, False, True])
473
474 result = pa.array(data, mask=mask)
475 expected = pa.array([1, None, 3, 4, None])
476
477 assert result.equals(expected)
478
479 # Mask wrong length
480 with pytest.raises(ValueError):
481 pa.array(data, mask=mask[1:])
482
483
484 def test_garbage_collection():
485 import gc
486
487 # Force the cyclic garbage collector to run
488 gc.collect()
489
490 bytes_before = pa.total_allocated_bytes()
491 pa.array([1, None, 3, None])
492 gc.collect()
493 assert pa.total_allocated_bytes() == bytes_before
494
495
496 def test_sequence_double():
497 data = [1.5, 1., None, 2.5, None, None]
498 arr = pa.array(data)
499 assert len(arr) == 6
500 assert arr.null_count == 3
501 assert arr.type == pa.float64()
502 assert arr.to_pylist() == data
503
504
505 def test_double_auto_coerce_from_integer():
506 # Done as part of ARROW-2814
507 data = [1.5, 1., None, 2.5, None, None]
508 arr = pa.array(data)
509
510 data2 = [1.5, 1, None, 2.5, None, None]
511 arr2 = pa.array(data2)
512
513 assert arr.equals(arr2)
514
515 data3 = [1, 1.5, None, 2.5, None, None]
516 arr3 = pa.array(data3)
517
518 data4 = [1., 1.5, None, 2.5, None, None]
519 arr4 = pa.array(data4)
520
521 assert arr3.equals(arr4)
522
523
524 def test_double_integer_coerce_representable_range():
525 valid_values = [1.5, 1, 2, None, 1 << 53, -(1 << 53)]
526 invalid_values = [1.5, 1, 2, None, (1 << 53) + 1]
527 invalid_values2 = [1.5, 1, 2, None, -((1 << 53) + 1)]
528
529 # it works
530 pa.array(valid_values)
531
532 # it fails
533 with pytest.raises(ValueError):
534 pa.array(invalid_values)
535
536 with pytest.raises(ValueError):
537 pa.array(invalid_values2)
538
539
540 def test_float32_integer_coerce_representable_range():
541 f32 = np.float32
542 valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
543 invalid_values = [f32(1.5), (1 << 24) + 1]
544 invalid_values2 = [f32(1.5), -((1 << 24) + 1)]
545
546 # it works
547 pa.array(valid_values, type=pa.float32())
548
549 # it fails
550 with pytest.raises(ValueError):
551 pa.array(invalid_values, type=pa.float32())
552
553 with pytest.raises(ValueError):
554 pa.array(invalid_values2, type=pa.float32())
555
556
557 def test_mixed_sequence_errors():
558 with pytest.raises(ValueError, match="tried to convert to boolean"):
559 pa.array([True, 'foo'], type=pa.bool_())
560
561 with pytest.raises(ValueError, match="tried to convert to float32"):
562 pa.array([1.5, 'foo'], type=pa.float32())
563
564 with pytest.raises(ValueError, match="tried to convert to double"):
565 pa.array([1.5, 'foo'])
566
567
568 @parametrize_with_sequence_types
569 @pytest.mark.parametrize("np_scalar,pa_type", [
570 (np.float16, pa.float16()),
571 (np.float32, pa.float32()),
572 (np.float64, pa.float64())
573 ])
574 @pytest.mark.parametrize("from_pandas", [True, False])
575 def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
576 data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
577 arr = pa.array(seq(data), from_pandas=from_pandas)
578 assert len(arr) == 6
579 if from_pandas:
580 assert arr.null_count == 3
581 else:
582 assert arr.null_count == 2
583 if from_pandas:
584 # The NaN is skipped in type inference, otherwise it forces a
585 # float64 promotion
586 assert arr.type == pa_type
587 else:
588 assert arr.type == pa.float64()
589
590 assert arr.to_pylist()[:4] == data[:4]
591 if from_pandas:
592 assert arr.to_pylist()[5] is None
593 else:
594 assert np.isnan(arr.to_pylist()[5])
595
596
597 @pytest.mark.parametrize("from_pandas", [True, False])
598 @pytest.mark.parametrize("inner_seq", [np.array, list])
599 def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
600 # ARROW-2806
601 data = np.array([
602 inner_seq([1., 2.]),
603 inner_seq([1., 2., 3.]),
604 inner_seq([np.nan]),
605 None
606 ], dtype=object)
607 arr = pa.array(data, from_pandas=from_pandas)
608 assert len(arr) == 4
609 assert arr.null_count == 1
610 assert arr.type == pa.list_(pa.float64())
611 if from_pandas:
612 assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
613 else:
614 np.testing.assert_equal(arr.to_pylist(),
615 [[1., 2.], [1., 2., 3.], [np.nan], None])
616
617
618 def test_nested_ndarray_in_object_array():
619 # ARROW-4350
620 arr = np.empty(2, dtype=object)
621 arr[:] = [np.array([1, 2], dtype=np.int64),
622 np.array([2, 3], dtype=np.int64)]
623
624 arr2 = np.empty(2, dtype=object)
625 arr2[0] = [3, 4]
626 arr2[1] = [5, 6]
627
628 expected_type = pa.list_(pa.list_(pa.int64()))
629 assert pa.infer_type([arr]) == expected_type
630
631 result = pa.array([arr, arr2])
632 expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
633 type=expected_type)
634
635 assert result.equals(expected)
636
637 # test case for len-1 arrays to ensure they are interpreted as
638 # sublists and not scalars
639 arr = np.empty(2, dtype=object)
640 arr[:] = [np.array([1]), np.array([2])]
641 result = pa.array([arr, arr])
642 assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
643
644
645 @pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
646 "not yet implemented"),
647 raises=AssertionError)
648 def test_multidimensional_ndarray_as_nested_list():
649 # TODO(wesm): see ARROW-5645
650 arr = np.array([[1, 2], [2, 3]], dtype=np.int64)
651 arr2 = np.array([[3, 4], [5, 6]], dtype=np.int64)
652
653 expected_type = pa.list_(pa.list_(pa.int64()))
654 assert pa.infer_type([arr]) == expected_type
655
656 result = pa.array([arr, arr2])
657 expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
658 type=expected_type)
659
660 assert result.equals(expected)
661
662
663 @pytest.mark.parametrize(('data', 'value_type'), [
664 ([True, False], pa.bool_()),
665 ([None, None], pa.null()),
666 ([1, 2, None], pa.int8()),
667 ([1, 2., 3., None], pa.float32()),
668 ([datetime.date.today(), None], pa.date32()),
669 ([None, datetime.date.today()], pa.date64()),
670 ([datetime.time(1, 1, 1), None], pa.time32('s')),
671 ([None, datetime.time(2, 2, 2)], pa.time64('us')),
672 ([datetime.datetime.now(), None], pa.timestamp('us')),
673 ([datetime.timedelta(seconds=10)], pa.duration('s')),
674 ([b"a", b"b"], pa.binary()),
675 ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
676 ([b"a", b"b", b"c"], pa.large_binary()),
677 (["a", "b", "c"], pa.string()),
678 (["a", "b", "c"], pa.large_string()),
679 (
680 [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
681 pa.struct([('a', pa.int8()), ('b', pa.int16())])
682 )
683 ])
684 def test_list_array_from_object_ndarray(data, value_type):
685 ty = pa.list_(value_type)
686 ndarray = np.array(data, dtype=object)
687 arr = pa.array([ndarray], type=ty)
688 assert arr.type.equals(ty)
689 assert arr.to_pylist() == [data]
690
691
692 @pytest.mark.parametrize(('data', 'value_type'), [
693 ([[1, 2], [3]], pa.list_(pa.int64())),
694 ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
695 ([[1], [2, 3]], pa.large_list(pa.int64()))
696 ])
697 def test_nested_list_array_from_object_ndarray(data, value_type):
698 ndarray = np.empty(len(data), dtype=object)
699 ndarray[:] = [np.array(item, dtype=object) for item in data]
700
701 ty = pa.list_(value_type)
702 arr = pa.array([ndarray], type=ty)
703 assert arr.type.equals(ty)
704 assert arr.to_pylist() == [data]
705
706
707 def test_array_ignore_nan_from_pandas():
708 # See ARROW-4324, this reverts logic that was introduced in
709 # ARROW-2240
710 with pytest.raises(ValueError):
711 pa.array([np.nan, 'str'])
712
713 arr = pa.array([np.nan, 'str'], from_pandas=True)
714 expected = pa.array([None, 'str'])
715 assert arr.equals(expected)
716
717
718 def test_nested_ndarray_different_dtypes():
719 data = [
720 np.array([1, 2, 3], dtype='int64'),
721 None,
722 np.array([4, 5, 6], dtype='uint32')
723 ]
724
725 arr = pa.array(data)
726 expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
727 type=pa.list_(pa.int64()))
728 assert arr.equals(expected)
729
730 t2 = pa.list_(pa.uint32())
731 arr2 = pa.array(data, type=t2)
732 expected2 = expected.cast(t2)
733 assert arr2.equals(expected2)
734
735
736 def test_sequence_unicode():
737 data = ['foo', 'bar', None, 'mañana']
738 arr = pa.array(data)
739 assert len(arr) == 4
740 assert arr.null_count == 1
741 assert arr.type == pa.string()
742 assert arr.to_pylist() == data
743
744
745 def check_array_mixed_unicode_bytes(binary_type, string_type):
746 values = ['qux', b'foo', bytearray(b'barz')]
747 b_values = [b'qux', b'foo', b'barz']
748 u_values = ['qux', 'foo', 'barz']
749
750 arr = pa.array(values)
751 expected = pa.array(b_values, type=pa.binary())
752 assert arr.type == pa.binary()
753 assert arr.equals(expected)
754
755 arr = pa.array(values, type=binary_type)
756 expected = pa.array(b_values, type=binary_type)
757 assert arr.type == binary_type
758 assert arr.equals(expected)
759
760 arr = pa.array(values, type=string_type)
761 expected = pa.array(u_values, type=string_type)
762 assert arr.type == string_type
763 assert arr.equals(expected)
764
765
766 def test_array_mixed_unicode_bytes():
767 check_array_mixed_unicode_bytes(pa.binary(), pa.string())
768 check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
769
770
771 @pytest.mark.large_memory
772 @pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
773 def test_large_binary_array(ty):
774 # Construct a large binary array with more than 4GB of data
775 s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
776 nrepeats = math.ceil((2**32 + 5) / len(s))
777 data = [s] * nrepeats
778 arr = pa.array(data, type=ty)
779 assert isinstance(arr, pa.Array)
780 assert arr.type == ty
781 assert len(arr) == nrepeats
782
783
784 @pytest.mark.slow
785 @pytest.mark.large_memory
786 @pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
787 def test_large_binary_value(ty):
788 # Construct a large binary array with a single value larger than 4GB
789 s = b"0123456789abcdefghijklmnopqrstuvwxyz"
790 nrepeats = math.ceil((2**32 + 5) / len(s))
791 arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
792 assert isinstance(arr, pa.Array)
793 assert arr.type == ty
794 assert len(arr) == 4
795 buf = arr[1].as_buffer()
796 assert len(buf) == len(s) * nrepeats
797
798
799 @pytest.mark.large_memory
800 @pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
801 def test_string_too_large(ty):
802 # Construct a binary array with a single value larger than 4GB
803 s = b"0123456789abcdefghijklmnopqrstuvwxyz"
804 nrepeats = math.ceil((2**32 + 5) / len(s))
805 with pytest.raises(pa.ArrowCapacityError):
806 pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
807
808
809 def test_sequence_bytes():
810 u1 = b'ma\xc3\xb1ana'
811
812 data = [b'foo',
813 memoryview(b'dada'),
814 memoryview(b'd-a-t-a')[::2], # non-contiguous is made contiguous
815 u1.decode('utf-8'), # unicode gets encoded
816 bytearray(b'bar'),
817 None]
818 for ty in [None, pa.binary(), pa.large_binary()]:
819 arr = pa.array(data, type=ty)
820 assert len(arr) == 6
821 assert arr.null_count == 1
822 assert arr.type == (ty or pa.binary())
823 assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]
824
825
826 @pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
827 def test_sequence_utf8_to_unicode(ty):
828 # ARROW-1225
829 data = [b'foo', None, b'bar']
830 arr = pa.array(data, type=ty)
831 assert arr.type == ty
832 assert arr[0].as_py() == 'foo'
833
834 # test a non-utf8 unicode string
835 val = ('mañana').encode('utf-16-le')
836 with pytest.raises(pa.ArrowInvalid):
837 pa.array([val], type=ty)
838
839
840 def test_sequence_fixed_size_bytes():
841 data = [b'foof', None, bytearray(b'barb'), b'2346']
842 arr = pa.array(data, type=pa.binary(4))
843 assert len(arr) == 4
844 assert arr.null_count == 1
845 assert arr.type == pa.binary(4)
846 assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
847
848
849 def test_fixed_size_bytes_does_not_accept_varying_lengths():
850 data = [b'foo', None, b'barb', b'2346']
851 with pytest.raises(pa.ArrowInvalid):
852 pa.array(data, type=pa.binary(4))
853
854
855 def test_fixed_size_binary_length_check():
856 # ARROW-10193
857 data = [b'\x19h\r\x9e\x00\x00\x00\x00\x01\x9b\x9fA']
858 assert len(data[0]) == 12
859 ty = pa.binary(12)
860 arr = pa.array(data, type=ty)
861 assert arr.to_pylist() == data
862
863
864 def test_sequence_date():
865 data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
866 datetime.date(2040, 2, 26)]
867 arr = pa.array(data)
868 assert len(arr) == 4
869 assert arr.type == pa.date32()
870 assert arr.null_count == 1
871 assert arr[0].as_py() == datetime.date(2000, 1, 1)
872 assert arr[1].as_py() is None
873 assert arr[2].as_py() == datetime.date(1970, 1, 1)
874 assert arr[3].as_py() == datetime.date(2040, 2, 26)
875
876
877 @pytest.mark.parametrize('input',
878 [(pa.date32(), [10957, None]),
879 (pa.date64(), [10957 * 86400000, None])])
880 def test_sequence_explicit_types(input):
881 t, ex_values = input
882 data = [datetime.date(2000, 1, 1), None]
883 arr = pa.array(data, type=t)
884 arr2 = pa.array(ex_values, type=t)
885
886 for x in [arr, arr2]:
887 assert len(x) == 2
888 assert x.type == t
889 assert x.null_count == 1
890 assert x[0].as_py() == datetime.date(2000, 1, 1)
891 assert x[1].as_py() is None
892
893
894 def test_date32_overflow():
895 # Overflow
896 data3 = [2**32, None]
897 with pytest.raises((OverflowError, pa.ArrowException)):
898 pa.array(data3, type=pa.date32())
899
900
901 @pytest.mark.parametrize(('time_type', 'unit', 'int_type'), [
902 (pa.time32, 's', 'int32'),
903 (pa.time32, 'ms', 'int32'),
904 (pa.time64, 'us', 'int64'),
905 (pa.time64, 'ns', 'int64'),
906 ])
907 def test_sequence_time_with_timezone(time_type, unit, int_type):
908 def expected_integer_value(t):
909 # only use with a UTC time object because this doesn't adjust for the
910 # tz offset
911 units = ['s', 'ms', 'us', 'ns']
912 multiplier = 10**(units.index(unit) * 3)
913 if t is None:
914 return None
915 seconds = (
916 t.hour * 3600 +
917 t.minute * 60 +
918 t.second +
919 t.microsecond * 10**-6
920 )
921 return int(seconds * multiplier)
922
923 def expected_time_value(t):
924 # only use with a UTC time object because this doesn't adjust for the
925 # time object's tzinfo
926 if unit == 's':
927 return t.replace(microsecond=0)
928 elif unit == 'ms':
929 return t.replace(microsecond=(t.microsecond // 1000) * 1000)
930 else:
931 return t
932
933 # only timezone-naive times are supported in Arrow
934 data = [
935 datetime.time(8, 23, 34, 123456),
936 datetime.time(5, 0, 0, 1000),
937 None,
938 datetime.time(1, 11, 56, 432539),
939 datetime.time(23, 10, 0, 437699)
940 ]
941
942 ty = time_type(unit)
943 arr = pa.array(data, type=ty)
944 assert len(arr) == 5
945 assert arr.type == ty
946 assert arr.null_count == 1
947
948 # test that the underlying integers are UTC values
949 values = arr.cast(int_type)
950 expected = list(map(expected_integer_value, data))
951 assert values.to_pylist() == expected
952
953 # test that the scalars are datetime.time objects with UTC timezone
954 assert arr[0].as_py() == expected_time_value(data[0])
955 assert arr[1].as_py() == expected_time_value(data[1])
956 assert arr[2].as_py() is None
957 assert arr[3].as_py() == expected_time_value(data[3])
958 assert arr[4].as_py() == expected_time_value(data[4])
959
960 def tz(hours, minutes=0):
961 offset = datetime.timedelta(hours=hours, minutes=minutes)
962 return datetime.timezone(offset)
963
964
965 def test_sequence_timestamp():
966 data = [
967 datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
968 None,
969 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
970 datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
971 ]
972 arr = pa.array(data)
973 assert len(arr) == 4
974 assert arr.type == pa.timestamp('us')
975 assert arr.null_count == 1
976 assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
977 23, 34, 123456)
978 assert arr[1].as_py() is None
979 assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
980 34, 56, 432539)
981 assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
982 46, 57, 437699)
983
984
985 @pytest.mark.parametrize('timezone', [
986 None,
987 'UTC',
988 'Etc/GMT-1',
989 'Europe/Budapest',
990 ])
991 @pytest.mark.parametrize('unit', [
992 's',
993 'ms',
994 'us',
995 'ns'
996 ])
997 def test_sequence_timestamp_with_timezone(timezone, unit):
998 def expected_integer_value(dt):
999 units = ['s', 'ms', 'us', 'ns']
1000 multiplier = 10**(units.index(unit) * 3)
1001 if dt is None:
1002 return None
1003 else:
1004 # avoid float precision issues
1005 ts = decimal.Decimal(str(dt.timestamp()))
1006 return int(ts * multiplier)
1007
1008 def expected_datetime_value(dt):
1009 if dt is None:
1010 return None
1011
1012 if unit == 's':
1013 dt = dt.replace(microsecond=0)
1014 elif unit == 'ms':
1015 dt = dt.replace(microsecond=(dt.microsecond // 1000) * 1000)
1016
1017 # adjust the timezone
1018 if timezone is None:
1019 # make datetime timezone unaware
1020 return dt.replace(tzinfo=None)
1021 else:
1022 # convert to the expected timezone
1023 return dt.astimezone(pytz.timezone(timezone))
1024
1025 data = [
1026 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1027 pytz.utc.localize(
1028 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1029 ),
1030 None,
1031 pytz.timezone('US/Eastern').localize(
1032 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1033 ),
1034 pytz.timezone('Europe/Moscow').localize(
1035 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1036 ),
1037 ]
1038 utcdata = [
1039 pytz.utc.localize(data[0]),
1040 data[1],
1041 None,
1042 data[3].astimezone(pytz.utc),
1043 data[4].astimezone(pytz.utc),
1044 ]
1045
1046 ty = pa.timestamp(unit, tz=timezone)
1047 arr = pa.array(data, type=ty)
1048 assert len(arr) == 5
1049 assert arr.type == ty
1050 assert arr.null_count == 1
1051
1052 # test that the underlying integers are UTC values
1053 values = arr.cast('int64')
1054 expected = list(map(expected_integer_value, utcdata))
1055 assert values.to_pylist() == expected
1056
1057 # test that the scalars are datetimes with the correct timezone
1058 for i in range(len(arr)):
1059 assert arr[i].as_py() == expected_datetime_value(utcdata[i])
1060
1061
1062 @pytest.mark.parametrize('timezone', [
1063 None,
1064 'UTC',
1065 'Etc/GMT-1',
1066 'Europe/Budapest',
1067 ])
1068 def test_pyarrow_ignore_timezone_environment_variable(monkeypatch, timezone):
1069 # note that any non-empty value will evaluate to true
1070 monkeypatch.setenv("PYARROW_IGNORE_TIMEZONE", "1")
1071 data = [
1072 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1073 pytz.utc.localize(
1074 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1075 ),
1076 pytz.timezone('US/Eastern').localize(
1077 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1078 ),
1079 pytz.timezone('Europe/Moscow').localize(
1080 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1081 ),
1082 ]
1083
1084 expected = [dt.replace(tzinfo=None) for dt in data]
1085 if timezone is not None:
1086 tzinfo = pytz.timezone(timezone)
1087 expected = [tzinfo.fromutc(dt) for dt in expected]
1088
1089 ty = pa.timestamp('us', tz=timezone)
1090 arr = pa.array(data, type=ty)
1091 assert arr.to_pylist() == expected
1092
1093
1094 def test_sequence_timestamp_with_timezone_inference():
1095 data = [
1096 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1097 pytz.utc.localize(
1098 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1099 ),
1100 None,
1101 pytz.timezone('US/Eastern').localize(
1102 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1103 ),
1104 pytz.timezone('Europe/Moscow').localize(
1105 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1106 ),
1107 ]
1108 expected = [
1109 pa.timestamp('us', tz=None),
1110 pa.timestamp('us', tz='UTC'),
1111 pa.timestamp('us', tz=None),
1112 pa.timestamp('us', tz='US/Eastern'),
1113 pa.timestamp('us', tz='Europe/Moscow')
1114 ]
1115 for dt, expected_type in zip(data, expected):
1116 prepended = [dt] + data
1117 arr = pa.array(prepended)
1118 assert arr.type == expected_type
1119
1120
1121 @pytest.mark.pandas
1122 def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes():
1123 import pandas as pd
1124
1125 data = [
1126 pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'),
1127 unit='ns'),
1128 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1129 pytz.utc.localize(
1130 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1131 ),
1132 None,
1133 ]
1134 utcdata = [
1135 data[0].astimezone(pytz.utc),
1136 pytz.utc.localize(data[1]),
1137 data[2].astimezone(pytz.utc),
1138 None,
1139 ]
1140
1141 arr = pa.array(data)
1142 assert arr.type == pa.timestamp('us', tz='US/Eastern')
1143
1144 values = arr.cast('int64')
1145 expected = [int(dt.timestamp() * 10**6) if dt else None for dt in utcdata]
1146 assert values.to_pylist() == expected
1147
1148
1149 def test_sequence_timestamp_out_of_bounds_nanosecond():
1150 # https://issues.apache.org/jira/browse/ARROW-9768
1151 # datetime outside of range supported for nanosecond resolution
1152 data = [datetime.datetime(2262, 4, 12)]
1153 with pytest.raises(ValueError, match="out of bounds"):
1154 pa.array(data, type=pa.timestamp('ns'))
1155
1156 # with microsecond resolution it works fine
1157 arr = pa.array(data, type=pa.timestamp('us'))
1158 assert arr.to_pylist() == data
1159
1160 # case where the naive value is within bounds, but the UTC-converted value is not
1161 tz = datetime.timezone(datetime.timedelta(hours=-1))
1162 data = [datetime.datetime(2262, 4, 11, 23, tzinfo=tz)]
1163 with pytest.raises(ValueError, match="out of bounds"):
1164 pa.array(data, type=pa.timestamp('ns'))
1165
1166 arr = pa.array(data, type=pa.timestamp('us'))
1167 assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12)
1168
1169
1170 def test_sequence_numpy_timestamp():
1171 data = [
1172 np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
1173 None,
1174 np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
1175 np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
1176 ]
1177 arr = pa.array(data)
1178 assert len(arr) == 4
1179 assert arr.type == pa.timestamp('us')
1180 assert arr.null_count == 1
1181 assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1182 23, 34, 123456)
1183 assert arr[1].as_py() is None
1184 assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
1185 34, 56, 432539)
1186 assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
1187 46, 57, 437699)
1188
1189
1190 class MyDate(datetime.date):
1191 pass
1192
1193
1194 class MyDatetime(datetime.datetime):
1195 pass
1196
1197
1198 class MyTimedelta(datetime.timedelta):
1199 pass
1200
1201
1202 def test_datetime_subclassing():
1203 data = [
1204 MyDate(2007, 7, 13),
1205 ]
1206 date_type = pa.date32()
1207 arr_date = pa.array(data, type=date_type)
1208 assert len(arr_date) == 1
1209 assert arr_date.type == date_type
1210 assert arr_date[0].as_py() == datetime.date(2007, 7, 13)
1211
1212 data = [
1213 MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
1214 ]
1215
1216 s = pa.timestamp('s')
1217 ms = pa.timestamp('ms')
1218 us = pa.timestamp('us')
1219
1220 arr_s = pa.array(data, type=s)
1221 assert len(arr_s) == 1
1222 assert arr_s.type == s
1223 assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1224 23, 34, 0)
1225
1226 arr_ms = pa.array(data, type=ms)
1227 assert len(arr_ms) == 1
1228 assert arr_ms.type == ms
1229 assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1230 23, 34, 123000)
1231
1232 arr_us = pa.array(data, type=us)
1233 assert len(arr_us) == 1
1234 assert arr_us.type == us
1235 assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1236 23, 34, 123456)
1237
1238 data = [
1239 MyTimedelta(123, 456, 1002),
1240 ]
1241
1242 s = pa.duration('s')
1243 ms = pa.duration('ms')
1244 us = pa.duration('us')
1245
1246 arr_s = pa.array(data)
1247 assert len(arr_s) == 1
1248 assert arr_s.type == us
1249 assert arr_s[0].as_py() == datetime.timedelta(123, 456, 1002)
1250
1251 arr_s = pa.array(data, type=s)
1252 assert len(arr_s) == 1
1253 assert arr_s.type == s
1254 assert arr_s[0].as_py() == datetime.timedelta(123, 456)
1255
1256 arr_ms = pa.array(data, type=ms)
1257 assert len(arr_ms) == 1
1258 assert arr_ms.type == ms
1259 assert arr_ms[0].as_py() == datetime.timedelta(123, 456, 1000)
1260
1261 arr_us = pa.array(data, type=us)
1262 assert len(arr_us) == 1
1263 assert arr_us.type == us
1264 assert arr_us[0].as_py() == datetime.timedelta(123, 456, 1002)
1265
1266
1267 @pytest.mark.xfail(not _pandas_api.have_pandas,
1268 reason="pandas required for nanosecond conversion")
1269 def test_sequence_timestamp_nanoseconds():
1270 inputs = [
1271 [datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)],
1272 [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
1273 ]
1274
1275 for data in inputs:
1276 ns = pa.timestamp('ns')
1277 arr_ns = pa.array(data, type=ns)
1278 assert len(arr_ns) == 1
1279 assert arr_ns.type == ns
1280 assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1281 23, 34, 123456)
1282
1283
1284 @pytest.mark.pandas
1285 def test_sequence_timestamp_from_int_with_unit():
1286 # TODO(wesm): This test might be rewritten to assert the actual behavior
1287 # when pandas is not installed
1288
1289 data = [1]
1290
1291 s = pa.timestamp('s')
1292 ms = pa.timestamp('ms')
1293 us = pa.timestamp('us')
1294 ns = pa.timestamp('ns')
1295
1296 arr_s = pa.array(data, type=s)
1297 assert len(arr_s) == 1
1298 assert arr_s.type == s
1299 assert repr(arr_s[0]) == (
1300 "<pyarrow.TimestampScalar: datetime.datetime(1970, 1, 1, 0, 0, 1)>"
1301 )
1302 assert str(arr_s[0]) == "1970-01-01 00:00:01"
1303
1304 arr_ms = pa.array(data, type=ms)
1305 assert len(arr_ms) == 1
1306 assert arr_ms.type == ms
1307 assert repr(arr_ms[0].as_py()) == (
1308 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1000)"
1309 )
1310 assert str(arr_ms[0]) == "1970-01-01 00:00:00.001000"
1311
1312 arr_us = pa.array(data, type=us)
1313 assert len(arr_us) == 1
1314 assert arr_us.type == us
1315 assert repr(arr_us[0].as_py()) == (
1316 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1)"
1317 )
1318 assert str(arr_us[0]) == "1970-01-01 00:00:00.000001"
1319
1320 arr_ns = pa.array(data, type=ns)
1321 assert len(arr_ns) == 1
1322 assert arr_ns.type == ns
1323 assert repr(arr_ns[0].as_py()) == (
1324 "Timestamp('1970-01-01 00:00:00.000000001')"
1325 )
1326 assert str(arr_ns[0]) == "1970-01-01 00:00:00.000000001"
1327
1328 expected_exc = TypeError
1329
1330 class CustomClass():
1331 pass
1332
1333 for ty in [ns, pa.date32(), pa.date64()]:
1334 with pytest.raises(expected_exc):
1335 pa.array([1, CustomClass()], type=ty)
1336
1337
1338 @pytest.mark.parametrize('np_scalar', [True, False])
1339 def test_sequence_duration(np_scalar):
1340 td1 = datetime.timedelta(2, 3601, 1)
1341 td2 = datetime.timedelta(1, 100, 1000)
1342 if np_scalar:
1343 data = [np.timedelta64(td1), None, np.timedelta64(td2)]
1344 else:
1345 data = [td1, None, td2]
1346
1347 arr = pa.array(data)
1348 assert len(arr) == 3
1349 assert arr.type == pa.duration('us')
1350 assert arr.null_count == 1
1351 assert arr[0].as_py() == td1
1352 assert arr[1].as_py() is None
1353 assert arr[2].as_py() == td2
1354
1355
1356 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
1357 def test_sequence_duration_with_unit(unit):
1358 data = [
1359 datetime.timedelta(3, 22, 1001),
1360 ]
1361 expected = {'s': datetime.timedelta(3, 22),
1362 'ms': datetime.timedelta(3, 22, 1000),
1363 'us': datetime.timedelta(3, 22, 1001),
1364 'ns': datetime.timedelta(3, 22, 1001)}
1365
1366 ty = pa.duration(unit)
1367
1368 arr_s = pa.array(data, type=ty)
1369 assert len(arr_s) == 1
1370 assert arr_s.type == ty
1371 assert arr_s[0].as_py() == expected[unit]
1372
1373
1374 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
1375 def test_sequence_duration_from_int_with_unit(unit):
1376 data = [5]
1377
1378 ty = pa.duration(unit)
1379 arr = pa.array(data, type=ty)
1380 assert len(arr) == 1
1381 assert arr.type == ty
1382 assert arr[0].value == 5
1383
1384
1385 def test_sequence_duration_nested_lists():
1386 td1 = datetime.timedelta(1, 1, 1000)
1387 td2 = datetime.timedelta(1, 100)
1388
1389 data = [[td1, None], [td1, td2]]
1390
1391 arr = pa.array(data)
1392 assert len(arr) == 2
1393 assert arr.type == pa.list_(pa.duration('us'))
1394 assert arr.to_pylist() == data
1395
1396 arr = pa.array(data, type=pa.list_(pa.duration('ms')))
1397 assert len(arr) == 2
1398 assert arr.type == pa.list_(pa.duration('ms'))
1399 assert arr.to_pylist() == data
1400
1401
1402 def test_sequence_duration_nested_lists_numpy():
1403 td1 = datetime.timedelta(1, 1, 1000)
1404 td2 = datetime.timedelta(1, 100)
1405
1406 data = [[np.timedelta64(td1), None],
1407 [np.timedelta64(td1), np.timedelta64(td2)]]
1408
1409 arr = pa.array(data)
1410 assert len(arr) == 2
1411 assert arr.type == pa.list_(pa.duration('us'))
1412 assert arr.to_pylist() == [[td1, None], [td1, td2]]
1413
1414 data = [np.array([np.timedelta64(td1), None], dtype='timedelta64[us]'),
1415 np.array([np.timedelta64(td1), np.timedelta64(td2)])]
1416
1417 arr = pa.array(data)
1418 assert len(arr) == 2
1419 assert arr.type == pa.list_(pa.duration('us'))
1420 assert arr.to_pylist() == [[td1, None], [td1, td2]]
1421
1422
1423 def test_sequence_nesting_levels():
1424 data = [1, 2, None]
1425 arr = pa.array(data)
1426 assert arr.type == pa.int64()
1427 assert arr.to_pylist() == data
1428
1429 data = [[1], [2], None]
1430 arr = pa.array(data)
1431 assert arr.type == pa.list_(pa.int64())
1432 assert arr.to_pylist() == data
1433
1434 data = [[1], [2, 3, 4], [None]]
1435 arr = pa.array(data)
1436 assert arr.type == pa.list_(pa.int64())
1437 assert arr.to_pylist() == data
1438
1439 data = [None, [[None, 1]], [[2, 3, 4], None], [None]]
1440 arr = pa.array(data)
1441 assert arr.type == pa.list_(pa.list_(pa.int64()))
1442 assert arr.to_pylist() == data
1443
1444 exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)
1445
1446 # Mixed nesting levels are rejected
1447 with pytest.raises(exceptions):
1448 pa.array([1, 2, [1]])
1449
1450 with pytest.raises(exceptions):
1451 pa.array([1, 2, []])
1452
1453 with pytest.raises(exceptions):
1454 pa.array([[1], [2], [None, [1]]])
1455
1456
1457 def test_sequence_mixed_types_fails():
1458 data = ['a', 1, 2.0]
1459 with pytest.raises(pa.ArrowTypeError):
1460 pa.array(data)
1461
1462
1463 def test_sequence_mixed_types_with_specified_type_fails():
1464 data = ['-10', '-5', {'a': 1}, '0', '5', '10']
1465
1466 type = pa.string()
1467 with pytest.raises(TypeError):
1468 pa.array(data, type=type)
1469
1470
1471 def test_sequence_decimal():
1472 data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
1473 for type in [pa.decimal128, pa.decimal256]:
1474 arr = pa.array(data, type=type(precision=7, scale=3))
1475 assert arr.to_pylist() == data
1476
1477
1478 def test_sequence_decimal_different_precisions():
1479 data = [
1480 decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
1481 ]
1482 for type in [pa.decimal128, pa.decimal256]:
1483 arr = pa.array(data, type=type(precision=13, scale=3))
1484 assert arr.to_pylist() == data
1485
1486
1487 def test_sequence_decimal_no_scale():
1488 data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
1489 for type in [pa.decimal128, pa.decimal256]:
1490 arr = pa.array(data, type=type(precision=10))
1491 assert arr.to_pylist() == data
1492
1493
1494 def test_sequence_decimal_negative():
1495 data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
1496 for type in [pa.decimal128, pa.decimal256]:
1497 arr = pa.array(data, type=type(precision=10, scale=6))
1498 assert arr.to_pylist() == data
1499
1500
1501 def test_sequence_decimal_no_whole_part():
1502 data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
1503 for type in [pa.decimal128, pa.decimal256]:
1504 arr = pa.array(data, type=type(precision=7, scale=7))
1505 assert arr.to_pylist() == data
1506
1507
1508 def test_sequence_decimal_large_integer():
1509 data = [decimal.Decimal('-394029506937548693.42983'),
1510 decimal.Decimal('32358695912932.01033')]
1511 for type in [pa.decimal128, pa.decimal256]:
1512 arr = pa.array(data, type=type(precision=23, scale=5))
1513 assert arr.to_pylist() == data
1514
1515
1516 def test_sequence_decimal_from_integers():
1517 data = [0, 1, -39402950693754869342983]
1518 expected = [decimal.Decimal(x) for x in data]
1519 for type in [pa.decimal128, pa.decimal256]:
1520 arr = pa.array(data, type=type(precision=28, scale=5))
1521 assert arr.to_pylist() == expected
1522
1523
1524 def test_sequence_decimal_too_high_precision():
1525 # ARROW-6989 python decimal has too high precision
1526 with pytest.raises(ValueError, match="precision out of range"):
1527 pa.array([decimal.Decimal('1' * 80)])
1528
1529
1530 def test_sequence_decimal_infer():
1531 for data, typ in [
1532 # simple case
1533 (decimal.Decimal('1.234'), pa.decimal128(4, 3)),
1534 # trailing zeros
1535 (decimal.Decimal('12300'), pa.decimal128(5, 0)),
1536 (decimal.Decimal('12300.0'), pa.decimal128(6, 1)),
1537 # scientific power notation
1538 (decimal.Decimal('1.23E+4'), pa.decimal128(5, 0)),
1539 (decimal.Decimal('123E+2'), pa.decimal128(5, 0)),
1540 (decimal.Decimal('123E+4'), pa.decimal128(7, 0)),
1541 # leading zeros
1542 (decimal.Decimal('0.0123'), pa.decimal128(4, 4)),
1543 (decimal.Decimal('0.01230'), pa.decimal128(5, 5)),
1544 (decimal.Decimal('1.230E-2'), pa.decimal128(5, 5)),
1545 ]:
1546 assert pa.infer_type([data]) == typ
1547 arr = pa.array([data])
1548 assert arr.type == typ
1549 assert arr.to_pylist()[0] == data
1550
1551
1552 def test_sequence_decimal_infer_mixed():
1553 # ARROW-12150 - ensure mixed precision gets correctly inferred to
1554 # common type that can hold all input values
1555 cases = [
1556 ([decimal.Decimal('1.234'), decimal.Decimal('3.456')],
1557 pa.decimal128(4, 3)),
1558 ([decimal.Decimal('1.234'), decimal.Decimal('456.7')],
1559 pa.decimal128(6, 3)),
1560 ([decimal.Decimal('123.4'), decimal.Decimal('4.567')],
1561 pa.decimal128(6, 3)),
1562 ([decimal.Decimal('123e2'), decimal.Decimal('4567e3')],
1563 pa.decimal128(7, 0)),
1564 ([decimal.Decimal('123e4'), decimal.Decimal('4567e2')],
1565 pa.decimal128(7, 0)),
1566 ([decimal.Decimal('0.123'), decimal.Decimal('0.04567')],
1567 pa.decimal128(5, 5)),
1568 ([decimal.Decimal('0.001'), decimal.Decimal('1.01E5')],
1569 pa.decimal128(9, 3)),
1570 ]
1571 for data, typ in cases:
1572 assert pa.infer_type(data) == typ
1573 arr = pa.array(data)
1574 assert arr.type == typ
1575 assert arr.to_pylist() == data
1576
1577
1578 def test_sequence_decimal_given_type():
1579 for data, typs, wrong_typs in [
1580 # simple case
1581 (
1582 decimal.Decimal('1.234'),
1583 [pa.decimal128(4, 3), pa.decimal128(5, 3), pa.decimal128(5, 4)],
1584 [pa.decimal128(4, 2), pa.decimal128(4, 4)]
1585 ),
1586 # trailing zeros
1587 (
1588 decimal.Decimal('12300'),
1589 [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
1590 [pa.decimal128(4, 0), pa.decimal128(3, -3)]
1591 ),
1592 # scientific power notation
1593 (
1594 decimal.Decimal('1.23E+4'),
1595 [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
1596 [pa.decimal128(4, 0), pa.decimal128(3, -3)]
1597 ),
1598 ]:
1599 for typ in typs:
1600 arr = pa.array([data], type=typ)
1601 assert arr.type == typ
1602 assert arr.to_pylist()[0] == data
1603 for typ in wrong_typs:
1604 with pytest.raises(ValueError):
1605 pa.array([data], type=typ)
1606
1607
1608 def test_range_types():
1609 arr1 = pa.array(range(3))
1610 arr2 = pa.array((0, 1, 2))
1611 assert arr1.equals(arr2)
1612
1613
1614 def test_empty_range():
1615 arr = pa.array(range(0))
1616 assert len(arr) == 0
1617 assert arr.null_count == 0
1618 assert arr.type == pa.null()
1619 assert arr.to_pylist() == []
1620
1621
1622 def test_structarray():
1623 arr = pa.StructArray.from_arrays([], names=[])
1624 assert arr.type == pa.struct([])
1625 assert len(arr) == 0
1626 assert arr.to_pylist() == []
1627
1628 ints = pa.array([None, 2, 3], type=pa.int64())
1629 strs = pa.array(['a', None, 'c'], type=pa.string())
1630 bools = pa.array([True, False, None], type=pa.bool_())
1631 arr = pa.StructArray.from_arrays(
1632 [ints, strs, bools],
1633 ['ints', 'strs', 'bools'])
1634
1635 expected = [
1636 {'ints': None, 'strs': 'a', 'bools': True},
1637 {'ints': 2, 'strs': None, 'bools': False},
1638 {'ints': 3, 'strs': 'c', 'bools': None},
1639 ]
1640
1641 pylist = arr.to_pylist()
1642 assert pylist == expected, (pylist, expected)
1643
1644 # len(names) != len(arrays)
1645 with pytest.raises(ValueError):
1646 pa.StructArray.from_arrays([ints], ['ints', 'strs'])
1647
1648
1649 def test_struct_from_dicts():
1650 ty = pa.struct([pa.field('a', pa.int32()),
1651 pa.field('b', pa.string()),
1652 pa.field('c', pa.bool_())])
1653 arr = pa.array([], type=ty)
1654 assert arr.to_pylist() == []
1655
1656 data = [{'a': 5, 'b': 'foo', 'c': True},
1657 {'a': 6, 'b': 'bar', 'c': False}]
1658 arr = pa.array(data, type=ty)
1659 assert arr.to_pylist() == data
1660
1661 # With omitted values
1662 data = [{'a': 5, 'c': True},
1663 None,
1664 {},
1665 {'a': None, 'b': 'bar'}]
1666 arr = pa.array(data, type=ty)
1667 expected = [{'a': 5, 'b': None, 'c': True},
1668 None,
1669 {'a': None, 'b': None, 'c': None},
1670 {'a': None, 'b': 'bar', 'c': None}]
1671 assert arr.to_pylist() == expected
1672
1673
1674 def test_struct_from_dicts_bytes_keys():
1675 # ARROW-6878
1676 ty = pa.struct([pa.field('a', pa.int32()),
1677 pa.field('b', pa.string()),
1678 pa.field('c', pa.bool_())])
1679 arr = pa.array([], type=ty)
1680 assert arr.to_pylist() == []
1681
1682 data = [{b'a': 5, b'b': 'foo'},
1683 {b'a': 6, b'c': False}]
1684 arr = pa.array(data, type=ty)
1685 assert arr.to_pylist() == [
1686 {'a': 5, 'b': 'foo', 'c': None},
1687 {'a': 6, 'b': None, 'c': False},
1688 ]
1689
1690
1691 def test_struct_from_tuples():
1692 ty = pa.struct([pa.field('a', pa.int32()),
1693 pa.field('b', pa.string()),
1694 pa.field('c', pa.bool_())])
1695
1696 data = [(5, 'foo', True),
1697 (6, 'bar', False)]
1698 expected = [{'a': 5, 'b': 'foo', 'c': True},
1699 {'a': 6, 'b': 'bar', 'c': False}]
1700 arr = pa.array(data, type=ty)
1701
1702 data_as_ndarray = np.empty(len(data), dtype=object)
1703 data_as_ndarray[:] = data
1704 arr2 = pa.array(data_as_ndarray, type=ty)
1705 assert arr.to_pylist() == expected
1706
1707 assert arr.equals(arr2)
1708
1709 # With omitted values
1710 data = [(5, 'foo', None),
1711 None,
1712 (6, None, False)]
1713 expected = [{'a': 5, 'b': 'foo', 'c': None},
1714 None,
1715 {'a': 6, 'b': None, 'c': False}]
1716 arr = pa.array(data, type=ty)
1717 assert arr.to_pylist() == expected
1718
1719 # Invalid tuple size
1720 for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
1721 with pytest.raises(ValueError, match="(?i)tuple size"):
1722 pa.array([tup], type=ty)
1723
1724
1725 def test_struct_from_list_of_pairs():
1726 ty = pa.struct([
1727 pa.field('a', pa.int32()),
1728 pa.field('b', pa.string()),
1729 pa.field('c', pa.bool_())
1730 ])
1731 data = [
1732 [('a', 5), ('b', 'foo'), ('c', True)],
1733 [('a', 6), ('b', 'bar'), ('c', False)],
1734 None
1735 ]
1736 arr = pa.array(data, type=ty)
1737 assert arr.to_pylist() == [
1738 {'a': 5, 'b': 'foo', 'c': True},
1739 {'a': 6, 'b': 'bar', 'c': False},
1740 None
1741 ]
1742
1743 # test with duplicated field names
1744 ty = pa.struct([
1745 pa.field('a', pa.int32()),
1746 pa.field('a', pa.string()),
1747 pa.field('b', pa.bool_())
1748 ])
1749 data = [
1750 [('a', 5), ('a', 'foo'), ('b', True)],
1751 [('a', 6), ('a', 'bar'), ('b', False)],
1752 ]
1753 arr = pa.array(data, type=ty)
1754 with pytest.raises(ValueError):
1755 # TODO(kszucs): ARROW-9997
1756 arr.to_pylist()
1757
1758 # test with empty elements
1759 ty = pa.struct([
1760 pa.field('a', pa.int32()),
1761 pa.field('b', pa.string()),
1762 pa.field('c', pa.bool_())
1763 ])
1764 data = [
1765 [],
1766 [('a', 5), ('b', 'foo'), ('c', True)],
1767 [('a', 2), ('b', 'baz')],
1768 [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')],
1769 ]
1770 expected = [
1771 {'a': None, 'b': None, 'c': None},
1772 {'a': 5, 'b': 'foo', 'c': True},
1773 {'a': 2, 'b': 'baz', 'c': None},
1774 {'a': 1, 'b': 'bar', 'c': False},
1775 ]
1776 arr = pa.array(data, type=ty)
1777 assert arr.to_pylist() == expected
1778
1779
1780 def test_struct_from_list_of_pairs_errors():
1781 ty = pa.struct([
1782 pa.field('a', pa.int32()),
1783 pa.field('b', pa.string()),
1784 pa.field('c', pa.bool_())
1785 ])
1786
1787 # test that it raises if the key doesn't match the expected field name
1788 data = [
1789 [],
1790 [('a', 5), ('c', True), ('b', None)],
1791 ]
1792 msg = "The expected field name is `b` but `c` was given"
1793 with pytest.raises(ValueError, match=msg):
1794 pa.array(data, type=ty)
1795
1796 # test various errors both at the first position and after because of key
1797 # type inference
1798 template = (
1799 r"Could not convert {} with type {}: was expecting tuple of "
1800 r"(key, value) pair"
1801 )
1802 cases = [
1803 tuple(), # empty key-value pair
1804 tuple('a',), # missing value
1805 tuple('unknown-key',), # not known field name
1806 'string', # not a tuple
1807 ]
1808 for key_value_pair in cases:
1809 msg = re.escape(template.format(
1810 repr(key_value_pair), type(key_value_pair).__name__
1811 ))
1812
1813 with pytest.raises(TypeError, match=msg):
1814 pa.array([
1815 [key_value_pair],
1816 [('a', 5), ('b', 'foo'), ('c', None)],
1817 ], type=ty)
1818
1819 with pytest.raises(TypeError, match=msg):
1820 pa.array([
1821 [('a', 5), ('b', 'foo'), ('c', None)],
1822 [key_value_pair],
1823 ], type=ty)
1824
1825
1826 def test_struct_from_mixed_sequence():
1827 # It is forbidden to mix dicts and tuples when initializing a struct array
1828 ty = pa.struct([pa.field('a', pa.int32()),
1829 pa.field('b', pa.string()),
1830 pa.field('c', pa.bool_())])
1831 data = [(5, 'foo', True),
1832 {'a': 6, 'b': 'bar', 'c': False}]
1833 with pytest.raises(TypeError):
1834 pa.array(data, type=ty)
1835
1836
1837 def test_struct_from_dicts_inference():
1838 expected_type = pa.struct([pa.field('a', pa.int64()),
1839 pa.field('b', pa.string()),
1840 pa.field('c', pa.bool_())])
1841 data = [{'a': 5, 'b': 'foo', 'c': True},
1842 {'a': 6, 'b': 'bar', 'c': False}]
1843
1844 arr = pa.array(data)
1845 check_struct_type(arr.type, expected_type)
1846 assert arr.to_pylist() == data
1847
1848 # With omitted values
1849 data = [{'a': 5, 'c': True},
1850 None,
1851 {},
1852 {'a': None, 'b': 'bar'}]
1853 expected = [{'a': 5, 'b': None, 'c': True},
1854 None,
1855 {'a': None, 'b': None, 'c': None},
1856 {'a': None, 'b': 'bar', 'c': None}]
1857
1858 arr = pa.array(data)
1859 data_as_ndarray = np.empty(len(data), dtype=object)
1860 data_as_ndarray[:] = data
1861 arr2 = pa.array(data_as_ndarray)  # same data via an object-dtype ndarray
1862
1863 check_struct_type(arr.type, expected_type)
1864 assert arr.to_pylist() == expected
1865 assert arr.equals(arr2)
1866
1867 # Nested
1868 expected_type = pa.struct([
1869 pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
1870 pa.field('ab', pa.bool_())])),
1871 pa.field('b', pa.string())])
1872 data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
1873 {'a': {'aa': None, 'ab': False}, 'b': None},
1874 {'a': None, 'b': 'bar'}]
1875 arr = pa.array(data)
1876
1877 assert arr.to_pylist() == data
1878
1879 # Edge cases
1880 arr = pa.array([{}])
1881 assert arr.type == pa.struct([])
1882 assert arr.to_pylist() == [{}]
1883
1884 # Mixing structs and scalars is rejected
1885 with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
1886 pa.array([1, {'a': 2}])
1887
1888
1889 def test_structarray_from_arrays_coerce():
1890 # ARROW-1706
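# the plain Python lists below are coerced with default type inference
# (int64 / utf8 / bool); calling from_arrays() without field names is expected
# to raise ValueError, as asserted at the end of the test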
1891 ints = [None, 2, 3]
1892 strs = ['a', None, 'c']
1893 bools = [True, False, None]
1894 ints_nonnull = [1, 2, 3]
1895
1896 arrays = [ints, strs, bools, ints_nonnull]
1897 result = pa.StructArray.from_arrays(arrays,
1898 ['ints', 'strs', 'bools',
1899 'int_nonnull'])
1900 expected = pa.StructArray.from_arrays(
1901 [pa.array(ints, type='int64'),
1902 pa.array(strs, type='utf8'),
1903 pa.array(bools),
1904 pa.array(ints_nonnull, type='int64')],
1905 ['ints', 'strs', 'bools', 'int_nonnull'])
1906
1907 with pytest.raises(ValueError):
1908 pa.StructArray.from_arrays(arrays)
1909
1910 assert result.equals(expected)
1911
1912
1913 def test_decimal_array_with_none_and_nan():
1914 values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]
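# with from_pandas=True, both np.nan and Decimal('nan') are treated as nulls,
# which is what the assertions below expect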
1915
1916 with pytest.raises(TypeError):
1917 # ARROW-6227: Without from_pandas=True, NaN is considered a float
1918 array = pa.array(values)
1919
1920 array = pa.array(values, from_pandas=True)
1921 assert array.type == pa.decimal128(4, 3)
1922 assert array.to_pylist() == values[:2] + [None, None]
1923
1924 array = pa.array(values, type=pa.decimal128(10, 4), from_pandas=True)
1925 assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
1926
1927
1928 def test_map_from_dicts():
1929 data = [[{'key': b'a', 'value': 1}, {'key': b'b', 'value': 2}],
1930 [{'key': b'c', 'value': 3}],
1931 [{'key': b'd', 'value': 4}, {'key': b'e', 'value': 5},
1932 {'key': b'f', 'value': None}],
1933 [{'key': b'g', 'value': 7}]]
1934 expected = [[(d['key'], d['value']) for d in entry] for entry in data]
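# expected keeps the (key, value) tuple form because that is how map entries
# come back from to_pylist()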
1935
1936 arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
1937
1938 assert arr.to_pylist() == expected
1939
1940 # With omitted values
1941 data[1] = None
1942 expected[1] = None
1943
1944 arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
1945
1946 assert arr.to_pylist() == expected
1947
1948 # Invalid dictionary
1949 for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]:
1950 with pytest.raises(ValueError, match="Invalid Map"):
1951 pa.array([entry], type=pa.map_('i4', 'i4'))
1952
1953 # Invalid dictionary types
1954 for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]:
1955 with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
1956 pa.array([entry], type=pa.map_('i4', 'i4'))
1957
1958
1959 def test_map_from_tuples():
1960 expected = [[(b'a', 1), (b'b', 2)],
1961 [(b'c', 3)],
1962 [(b'd', 4), (b'e', 5), (b'f', None)],
1963 [(b'g', 7)]]
1964
1965 arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
1966
1967 assert arr.to_pylist() == expected
1968
1969 # With omitted values
1970 expected[1] = None
1971
1972 arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
1973
1974 assert arr.to_pylist() == expected
1975
1976 # Invalid tuple size
1977 for entry in [[(5,)], [()], [('5', 'foo', True)]]:
1978 with pytest.raises(ValueError, match="(?i)tuple size"):
1979 pa.array([entry], type=pa.map_('i4', 'i4'))
1980
1981
1982 def test_dictionary_from_boolean():
1983 typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
1984 a = pa.array([False, False, True, False, True], type=typ)
1985 assert isinstance(a.type, pa.DictionaryType)
1986 assert a.type.equals(typ)
1987
1988 expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8())
1989 expected_dictionary = pa.array([False, True], type=pa.bool_())
1990 assert a.indices.equals(expected_indices)
1991 assert a.dictionary.equals(expected_dictionary)
1992
1993
1994 @pytest.mark.parametrize('value_type', [
1995 pa.int8(),
1996 pa.int16(),
1997 pa.int32(),
1998 pa.int64(),
1999 pa.uint8(),
2000 pa.uint16(),
2001 pa.uint32(),
2002 pa.uint64(),
2003 pa.float32(),
2004 pa.float64(),
2005 ])
2006 def test_dictionary_from_integers(value_type):
2007 typ = pa.dictionary(pa.int8(), value_type=value_type)
2008 a = pa.array([1, 2, 1, 1, 2, 3], type=typ)
2009 assert isinstance(a.type, pa.DictionaryType)
2010 assert a.type.equals(typ)
2011
2012 expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8())
2013 expected_dictionary = pa.array([1, 2, 3], type=value_type)
2014 assert a.indices.equals(expected_indices)
2015 assert a.dictionary.equals(expected_dictionary)
2016
2017
2018 @pytest.mark.parametrize('input_index_type', [
2019 pa.int8(),
2020 pa.int16(),
2021 pa.int32(),
2022 pa.int64()
2023 ])
2024 def test_dictionary_index_type(input_index_type):
2025 # the dictionary array is constructed with an adaptive index type builder,
2026 # but the requested index type is treated as the minimum width to use
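# e.g. requesting pa.int64() indices for only ten distinct values still yields
# int64 indices; the builder never narrows below the requested width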
2027
2028 typ = pa.dictionary(input_index_type, value_type=pa.int64())
2029 arr = pa.array(range(10), type=typ)
2030 assert arr.type.equals(typ)
2031
2032
2033 def test_dictionary_is_always_adaptive():
2034 # the dictionary array is constructed with an adaptive index type builder,
2035 # so the output index type may be wider than the requested one, depending on
2036 # how many distinct values the input contains
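# e.g. 2**7 distinct values still fit int8 indices, while 2**7 + 1 forces a
# widening to int16, as asserted below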
2037 typ = pa.dictionary(pa.int8(), value_type=pa.int64())
2038
2039 a = pa.array(range(2**7), type=typ)
2040 expected = pa.dictionary(pa.int8(), pa.int64())
2041 assert a.type.equals(expected)
2042
2043 a = pa.array(range(2**7 + 1), type=typ)
2044 expected = pa.dictionary(pa.int16(), pa.int64())
2045 assert a.type.equals(expected)
2046
2047
2048 def test_dictionary_from_strings():
2049 for value_type in [pa.binary(), pa.string()]:
2050 typ = pa.dictionary(pa.int8(), value_type)
2051 a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ)
2052
2053 assert isinstance(a.type, pa.DictionaryType)
2054
2055 expected_indices = pa.array([0, 1, 2, 1, 2, 3], type=pa.int8())
2056 expected_dictionary = pa.array(["", "a", "bb", "ccc"], type=value_type)
2057 assert a.indices.equals(expected_indices)
2058 assert a.dictionary.equals(expected_dictionary)
2059
2060 # fixed size binary type
2061 typ = pa.dictionary(pa.int8(), pa.binary(3))
2062 a = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ)
2063 assert isinstance(a.type, pa.DictionaryType)
2064
2065 expected_indices = pa.array([0, 0, 1, 2, 1], type=pa.int8())
2066 expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3))
2067 assert a.indices.equals(expected_indices)
2068 assert a.dictionary.equals(expected_dictionary)
2069
2070
2071 @pytest.mark.parametrize(('unit', 'expected'), [
2072 ('s', datetime.timedelta(seconds=-2147483000)),
2073 ('ms', datetime.timedelta(milliseconds=-2147483000)),
2074 ('us', datetime.timedelta(microseconds=-2147483000)),
2075 ('ns', datetime.timedelta(microseconds=-2147483))
2076 ])
2077 def test_duration_array_roundtrip_corner_cases(unit, expected):
2078 # Corner case discovered by hypothesis: there were implicit conversions to
2079 # unsigned values, resulting in values with the wrong sign.
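# the expected values in the parametrize list are datetime.timedelta objects,
# which only carry microsecond resolution; that is why the 'ns' case is
# expressed as -2147483 whole microseconds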
2080 ty = pa.duration(unit)
2081 arr = pa.array([-2147483000], type=ty)
2082 restored = pa.array(arr.to_pylist(), type=ty)
2083 assert arr.equals(restored)
2084
2085 expected_list = [expected]
2086 if unit == 'ns':
2087 # if pandas is available then a pandas Timedelta is returned
2088 try:
2089 import pandas as pd
2090 except ImportError:
2091 pass
2092 else:
2093 expected_list = [pd.Timedelta(-2147483000, unit='ns')]
2094
2095 assert restored.to_pylist() == expected_list
2096
2097
2098 @pytest.mark.pandas
2099 def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
2100 # corner case discovered by hypothesis: preserving the nanoseconds on
2101 # conversion from a list of Timedelta and Timestamp objects
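# datetime.timedelta and datetime.datetime cannot represent nanoseconds, so
# pandas Timedelta / Timestamp objects are required for a lossless roundtrip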
2102 import pandas as pd
2103
2104 ty = pa.duration('ns')
2105 arr = pa.array([9223371273709551616], type=ty)
2106 data = arr.to_pylist()
2107 assert isinstance(data[0], pd.Timedelta)
2108 restored = pa.array(data, type=ty)
2109 assert arr.equals(restored)
2110 assert restored.to_pylist() == [
2111 pd.Timedelta(9223371273709551616, unit='ns')
2112 ]
2113
2114 ty = pa.timestamp('ns')
2115 arr = pa.array([9223371273709551616], type=ty)
2116 data = arr.to_pylist()
2117 assert isinstance(data[0], pd.Timestamp)
2118 restored = pa.array(data, type=ty)
2119 assert arr.equals(restored)
2120 assert restored.to_pylist() == [
2121 pd.Timestamp(9223371273709551616, unit='ns')
2122 ]
2123
2124 ty = pa.timestamp('ns', tz='US/Eastern')
2125 value = 1604119893000000000
2126 arr = pa.array([value], type=ty)
2127 data = arr.to_pylist()
2128 assert isinstance(data[0], pd.Timestamp)
2129 restored = pa.array(data, type=ty)
2130 assert arr.equals(restored)
2131 assert restored.to_pylist() == [
2132 pd.Timestamp(value, unit='ns').tz_localize(
2133 "UTC").tz_convert('US/Eastern')
2134 ]
2135
2136
2137 @h.given(past.all_arrays)
2138 def test_array_to_pylist_roundtrip(arr):
2139 seq = arr.to_pylist()
2140 restored = pa.array(seq, type=arr.type)
2141 assert restored.equals(arr)
2142
2143
2144 @pytest.mark.large_memory
2145 def test_auto_chunking_binary_like():
2146 # v1 and v2 are sized so that 20 * len(v1) + len(v2) stays just below the size limit of a single binary array chunk
2147 v1 = b'x' * 100000000
2148 v2 = b'x' * 147483646
2149
2150 # single chunk
2151 one_chunk_data = [v1] * 20 + [b'', None, v2]
2152 arr = pa.array(one_chunk_data, type=pa.binary())
2153 assert isinstance(arr, pa.Array)
2154 assert len(arr) == 23
2155 assert arr[20].as_py() == b''
2156 assert arr[21].as_py() is None
2157 assert arr[22].as_py() == v2
2158
2159 # two chunks
2160 two_chunk_data = one_chunk_data + [b'two']
2161 arr = pa.array(two_chunk_data, type=pa.binary())
2162 assert isinstance(arr, pa.ChunkedArray)
2163 assert arr.num_chunks == 2
2164 assert len(arr.chunk(0)) == 23
2165 assert len(arr.chunk(1)) == 1
2166 assert arr.chunk(0)[20].as_py() == b''
2167 assert arr.chunk(0)[21].as_py() is None
2168 assert arr.chunk(0)[22].as_py() == v2
2169 assert arr.chunk(1).to_pylist() == [b'two']
2170
2171 # three chunks
2172 three_chunk_data = one_chunk_data * 2 + [b'three', b'three']
2173 arr = pa.array(three_chunk_data, type=pa.binary())
2174 assert isinstance(arr, pa.ChunkedArray)
2175 assert arr.num_chunks == 3
2176 assert len(arr.chunk(0)) == 23
2177 assert len(arr.chunk(1)) == 23
2178 assert len(arr.chunk(2)) == 2
2179 for i in range(2):
2180 assert arr.chunk(i)[20].as_py() == b''
2181 assert arr.chunk(i)[21].as_py() is None
2182 assert arr.chunk(i)[22].as_py() == v2
2183 assert arr.chunk(2).to_pylist() == [b'three', b'three']
2184
2185
2186 @pytest.mark.large_memory
2187 def test_auto_chunking_list_of_binary():
2188 # ARROW-6281
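# (2 << 20) + 1 single-element lists of 1024-byte strings add up to slightly
# more character data than a single binary child array can hold, so the
# conversion is expected to fall back to a ChunkedArray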
2189 vals = [['x' * 1024]] * ((2 << 20) + 1)
2190 arr = pa.array(vals)
2191 assert isinstance(arr, pa.ChunkedArray)
2192 assert arr.num_chunks == 2
2193 assert len(arr.chunk(0)) == 2**21 - 1
2194 assert len(arr.chunk(1)) == 2
2195 assert arr.chunk(1).to_pylist() == [['x' * 1024]] * 2
2196
2197
2198 @pytest.mark.large_memory
2199 def test_auto_chunking_list_like():
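# each item holds 2**28 uint8 values: seven items still fit in one ListArray,
# while eight items reach 2**31 child elements and should force chunking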
2200 item = np.ones((2**28,), dtype='uint8')
2201 data = [item] * (2**3 - 1)
2202 arr = pa.array(data, type=pa.list_(pa.uint8()))
2203 assert isinstance(arr, pa.Array)
2204 assert len(arr) == 7
2205
2206 item = np.ones((2**28,), dtype='uint8')
2207 data = [item] * 2**3
2208 arr = pa.array(data, type=pa.list_(pa.uint8()))
2209 assert isinstance(arr, pa.ChunkedArray)
2210 assert arr.num_chunks == 2
2211 assert len(arr.chunk(0)) == 7
2212 assert len(arr.chunk(1)) == 1
2213 chunk = arr.chunk(1)
2214 scalar = chunk[0]
2215 assert isinstance(scalar, pa.ListScalar)
2216 expected = pa.array(item, type=pa.uint8())
2217 assert scalar.values == expected
2218
2219
2220 @pytest.mark.slow
2221 @pytest.mark.large_memory
2222 def test_auto_chunking_map_type():
2223 # takes ~20 minutes locally
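# eight lists of 2**28 (key, value) pairs give 2**31 child elements in total,
# more than a single MapArray can index with 32-bit offsets, hence the 7 + 1 split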
2224 ty = pa.map_(pa.int8(), pa.int8())
2225 item = [(1, 1)] * 2**28
2226 data = [item] * 2**3
2227 arr = pa.array(data, type=ty)
2228 assert isinstance(arr, pa.ChunkedArray)
2229 assert len(arr.chunk(0)) == 7
2230 assert len(arr.chunk(1)) == 1
2231
2232
2233 @pytest.mark.large_memory
2234 @pytest.mark.parametrize(('ty', 'char'), [
2235 (pa.string(), 'x'),
2236 (pa.binary(), b'x'),
2237 ])
2238 def test_nested_auto_chunking(ty, char):
2239 v1 = char * 100000000
2240 v2 = char * 147483646
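# same sizes as in test_auto_chunking_binary_like: 20 * len(v1) + len(v2) sits
# just below the point where the string-like child must be split, so appending
# one more row is expected to trigger chunking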
2241
2242 struct_type = pa.struct([
2243 pa.field('bool', pa.bool_()),
2244 pa.field('integer', pa.int64()),
2245 pa.field('string-like', ty),
2246 ])
2247
2248 data = [{'bool': True, 'integer': 1, 'string-like': v1}] * 20
2249 data.append({'bool': True, 'integer': 1, 'string-like': v2})
2250 arr = pa.array(data, type=struct_type)
2251 assert isinstance(arr, pa.Array)
2252
2253 data.append({'bool': True, 'integer': 1, 'string-like': char})
2254 arr = pa.array(data, type=struct_type)
2255 assert isinstance(arr, pa.ChunkedArray)
2256 assert arr.num_chunks == 2
2257 assert len(arr.chunk(0)) == 21
2258 assert len(arr.chunk(1)) == 1
2259 assert arr.chunk(1)[0].as_py() == {
2260 'bool': True,
2261 'integer': 1,
2262 'string-like': char
2263 }
2264
2265
2266 @pytest.mark.large_memory
2267 def test_array_from_pylist_data_overflow():
2268 # Regression test for ARROW-12983
2269 # Data buffer overflow - should result in chunked array
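# 2**19 values of 4096 bytes each amount to 2**31 bytes of character data,
# more than a single string/binary array can address with 32-bit offsets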
2270 items = [b'a' * 4096] * (2 ** 19)
2271 arr = pa.array(items, type=pa.string())
2272 assert isinstance(arr, pa.ChunkedArray)
2273 assert len(arr) == 2**19
2274 assert len(arr.chunks) > 1
2275
2276 mask = np.zeros(2**19, bool)
2277 arr = pa.array(items, mask=mask, type=pa.string())
2278 assert isinstance(arr, pa.ChunkedArray)
2279 assert len(arr) == 2**19
2280 assert len(arr.chunks) > 1
2281
2282 arr = pa.array(items, type=pa.binary())
2283 assert isinstance(arr, pa.ChunkedArray)
2284 assert len(arr) == 2**19
2285 assert len(arr.chunks) > 1
2286
2287
2288 @pytest.mark.slow
2289 @pytest.mark.large_memory
2290 def test_array_from_pylist_offset_overflow():
2291 # Regression test for ARROW-12983
2292 # Offset buffer overflow - should result in chunked array
2293 # Note this doesn't apply to primitive arrays
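# 2**31 one-byte values overflow the 32-bit offset buffer by element count
# alone; primitive arrays have no offset buffer, so they are unaffected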
2294 items = [b'a'] * (2 ** 31)
2295 arr = pa.array(items, type=pa.string())
2296 assert isinstance(arr, pa.ChunkedArray)
2297 assert len(arr) == 2**31
2298 assert len(arr.chunks) > 1
2299
2300 mask = np.zeros(2**31, bool)
2301 arr = pa.array(items, mask=mask, type=pa.string())
2302 assert isinstance(arr, pa.ChunkedArray)
2303 assert len(arr) == 2**31
2304 assert len(arr.chunks) > 1
2305
2306 arr = pa.array(items, type=pa.binary())
2307 assert isinstance(arr, pa.ChunkedArray)
2308 assert len(arr) == 2**31
2309 assert len(arr.chunks) > 1