ceph/src/arrow/python/pyarrow/tests/test_convert_builtin.py
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 import collections
19 import datetime
20 import decimal
21 import itertools
22 import math
23 import re
24
25 import hypothesis as h
26 import numpy as np
27 import pytz
28 import pytest
29
30 from pyarrow.pandas_compat import _pandas_api # noqa
31 import pyarrow as pa
32 import pyarrow.tests.strategies as past
33
34
35 int_type_pairs = [
36 (np.int8, pa.int8()),
37 (np.int16, pa.int16()),
38 (np.int32, pa.int32()),
39 (np.int64, pa.int64()),
40 (np.uint8, pa.uint8()),
41 (np.uint16, pa.uint16()),
42 (np.uint32, pa.uint32()),
43 (np.uint64, pa.uint64())]
44
45
46 np_int_types, pa_int_types = zip(*int_type_pairs)
47
48
49 class StrangeIterable:
50 def __init__(self, lst):
51 self.lst = lst
52
53 def __iter__(self):
54 return self.lst.__iter__()
55
56
57 class MyInt:
58 def __init__(self, value):
59 self.value = value
60
61 def __int__(self):
62 return self.value
63
64
65 class MyBrokenInt:
66 def __int__(self):
67 1/0 # MARKER
68
69
70 def check_struct_type(ty, expected):
71 """
72 Check that a struct type matches the expected type, ignoring field order.
73 """
74 assert pa.types.is_struct(ty)
75 assert set(ty) == set(expected)
76
77
78 def test_iterable_types():
79 arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
80 arr2 = pa.array((0, 1, 2, 3))
81
82 assert arr1.equals(arr2)
83
84
85 def test_empty_iterable():
86 arr = pa.array(StrangeIterable([]))
87 assert len(arr) == 0
88 assert arr.null_count == 0
89 assert arr.type == pa.null()
90 assert arr.to_pylist() == []
91
92
93 def test_limited_iterator_types():
94 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
95 arr2 = pa.array((0, 1, 2))
96 assert arr1.equals(arr2)
97
98
99 def test_limited_iterator_size_overflow():
100 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
101 arr2 = pa.array((0, 1))
102 assert arr1.equals(arr2)
103
104
105 def test_limited_iterator_size_underflow():
106 arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
107 arr2 = pa.array((0, 1, 2))
108 assert arr1.equals(arr2)
109
110
111 def test_iterator_without_size():
112 expected = pa.array((0, 1, 2))
113 arr1 = pa.array(iter(range(3)))
114 assert arr1.equals(expected)
115 # Same with explicit type
116 arr1 = pa.array(iter(range(3)), type=pa.int64())
117 assert arr1.equals(expected)
118
119
120 def test_infinite_iterator():
121 expected = pa.array((0, 1, 2))
122 arr1 = pa.array(itertools.count(0), size=3)
123 assert arr1.equals(expected)
124 # Same with explicit type
125 arr1 = pa.array(itertools.count(0), type=pa.int64(), size=3)
126 assert arr1.equals(expected)
127
128
129 def _as_list(xs):
130 return xs
131
132
133 def _as_tuple(xs):
134 return tuple(xs)
135
136
137 def _as_deque(xs):
138 # deque is a sequence, but is neither a tuple nor a list
139 return collections.deque(xs)
140
141
142 def _as_dict_values(xs):
143 # a dict values object is not a sequence, just a regular iterable
144 dct = {k: v for k, v in enumerate(xs)}
145 return dct.values()
146
147
148 def _as_numpy_array(xs):
149 arr = np.empty(len(xs), dtype=object)
150 arr[:] = xs
151 return arr
152
153
154 def _as_set(xs):
155 return set(xs)
156
157
158 SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
159 ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
160 COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES
161
162 parametrize_with_iterable_types = pytest.mark.parametrize(
163 "seq", ITERABLE_TYPES
164 )
165
166 parametrize_with_sequence_types = pytest.mark.parametrize(
167 "seq", SEQUENCE_TYPES
168 )
169
170 parametrize_with_collections_types = pytest.mark.parametrize(
171 "seq", COLLECTIONS_TYPES
172 )
173
174
175 @parametrize_with_collections_types
176 def test_sequence_types(seq):
177 arr1 = pa.array(seq([1, 2, 3]))
178 arr2 = pa.array([1, 2, 3])
179
180 assert arr1.equals(arr2)
181
182
183 @parametrize_with_iterable_types
184 def test_nested_sequence_types(seq):
185 arr1 = pa.array([seq([1, 2, 3])])
186 arr2 = pa.array([[1, 2, 3]])
187
188 assert arr1.equals(arr2)
189
190
191 @parametrize_with_sequence_types
192 def test_sequence_boolean(seq):
193 expected = [True, None, False, None]
194 arr = pa.array(seq(expected))
195 assert len(arr) == 4
196 assert arr.null_count == 2
197 assert arr.type == pa.bool_()
198 assert arr.to_pylist() == expected
199
200
201 @parametrize_with_sequence_types
202 def test_sequence_numpy_boolean(seq):
203 expected = [np.bool_(True), None, np.bool_(False), None]
204 arr = pa.array(seq(expected))
205 assert arr.type == pa.bool_()
206 assert arr.to_pylist() == [True, None, False, None]
207
208
209 @parametrize_with_sequence_types
210 def test_sequence_mixed_numpy_python_bools(seq):
211 values = np.array([True, False])
212 arr = pa.array(seq([values[0], None, values[1], True, False]))
213 assert arr.type == pa.bool_()
214 assert arr.to_pylist() == [True, None, False, True, False]
215
216
217 @parametrize_with_collections_types
218 def test_empty_list(seq):
219 arr = pa.array(seq([]))
220 assert len(arr) == 0
221 assert arr.null_count == 0
222 assert arr.type == pa.null()
223 assert arr.to_pylist() == []
224
225
226 @parametrize_with_sequence_types
227 def test_nested_lists(seq):
228 data = [[], [1, 2], None]
229 arr = pa.array(seq(data))
230 assert len(arr) == 3
231 assert arr.null_count == 1
232 assert arr.type == pa.list_(pa.int64())
233 assert arr.to_pylist() == data
234 # With explicit type
235 arr = pa.array(seq(data), type=pa.list_(pa.int32()))
236 assert len(arr) == 3
237 assert arr.null_count == 1
238 assert arr.type == pa.list_(pa.int32())
239 assert arr.to_pylist() == data
240
241
242 @parametrize_with_sequence_types
243 def test_nested_large_lists(seq):
244 data = [[], [1, 2], None]
245 arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
246 assert len(arr) == 3
247 assert arr.null_count == 1
248 assert arr.type == pa.large_list(pa.int16())
249 assert arr.to_pylist() == data
250
251
252 @parametrize_with_collections_types
253 def test_list_with_non_list(seq):
254 # List types don't accept non-sequences
255 with pytest.raises(TypeError):
256 pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
257 with pytest.raises(TypeError):
258 pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
259
260
261 @parametrize_with_sequence_types
262 def test_nested_arrays(seq):
263 arr = pa.array(seq([np.array([], dtype=np.int64),
264 np.array([1, 2], dtype=np.int64), None]))
265 assert len(arr) == 3
266 assert arr.null_count == 1
267 assert arr.type == pa.list_(pa.int64())
268 assert arr.to_pylist() == [[], [1, 2], None]
269
270
271 @parametrize_with_sequence_types
272 def test_nested_fixed_size_list(seq):
273 # sequence of lists
274 data = [[1, 2], [3, None], None]
275 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
276 assert len(arr) == 3
277 assert arr.null_count == 1
278 assert arr.type == pa.list_(pa.int64(), 2)
279 assert arr.to_pylist() == data
280
281 # sequence of numpy arrays
282 data = [np.array([1, 2], dtype='int64'), np.array([3, 4], dtype='int64'),
283 None]
284 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
285 assert len(arr) == 3
286 assert arr.null_count == 1
287 assert arr.type == pa.list_(pa.int64(), 2)
288 assert arr.to_pylist() == [[1, 2], [3, 4], None]
289
290 # incorrect length of the lists or arrays
291 data = [[1, 2, 4], [3, None], None]
292 for data in [[[1, 2, 3]], [np.array([1, 2, 4], dtype='int64')]]:
293 with pytest.raises(
294 ValueError, match="Length of item not correct: expected 2"):
295 pa.array(seq(data), type=pa.list_(pa.int64(), 2))
296
297 # with list size of 0
298 data = [[], [], None]
299 arr = pa.array(seq(data), type=pa.list_(pa.int64(), 0))
300 assert len(arr) == 3
301 assert arr.null_count == 1
302 assert arr.type == pa.list_(pa.int64(), 0)
303 assert arr.to_pylist() == [[], [], None]
304
305
306 @parametrize_with_sequence_types
307 def test_sequence_all_none(seq):
308 arr = pa.array(seq([None, None]))
309 assert len(arr) == 2
310 assert arr.null_count == 2
311 assert arr.type == pa.null()
312 assert arr.to_pylist() == [None, None]
313
314
315 @parametrize_with_sequence_types
316 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
317 def test_sequence_integer(seq, np_scalar_pa_type):
318 np_scalar, pa_type = np_scalar_pa_type
319 expected = [1, None, 3, None,
320 np.iinfo(np_scalar).min, np.iinfo(np_scalar).max]
321 arr = pa.array(seq(expected), type=pa_type)
322 assert len(arr) == 6
323 assert arr.null_count == 2
324 assert arr.type == pa_type
325 assert arr.to_pylist() == expected
326
327
328 @parametrize_with_collections_types
329 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
330 def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
331 # ARROW-2806: numpy.nan is a double value and thus should produce
332 # a double array.
333 _, pa_type = np_scalar_pa_type
334 with pytest.raises(ValueError):
335 pa.array(seq([np.nan]), type=pa_type, from_pandas=False)
336
337 arr = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
338 expected = [None]
339 assert len(arr) == 1
340 assert arr.null_count == 1
341 assert arr.type == pa_type
342 assert arr.to_pylist() == expected
343
344
345 @parametrize_with_sequence_types
346 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
347 def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
348 # ARROW-2806: numpy.nan is a double value and thus should produce
349 # a double array.
350 _, pa_type = np_scalar_pa_type
351 with pytest.raises(ValueError):
352 pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)
353
354 arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
355 expected = [[None]]
356 assert len(arr) == 1
357 assert arr.null_count == 0
358 assert arr.type == pa.list_(pa_type)
359 assert arr.to_pylist() == expected
360
361
362 @parametrize_with_sequence_types
363 def test_sequence_integer_inferred(seq):
364 expected = [1, None, 3, None]
365 arr = pa.array(seq(expected))
366 assert len(arr) == 4
367 assert arr.null_count == 2
368 assert arr.type == pa.int64()
369 assert arr.to_pylist() == expected
370
371
372 @parametrize_with_sequence_types
373 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
374 def test_sequence_numpy_integer(seq, np_scalar_pa_type):
375 np_scalar, pa_type = np_scalar_pa_type
376 expected = [np_scalar(1), None, np_scalar(3), None,
377 np_scalar(np.iinfo(np_scalar).min),
378 np_scalar(np.iinfo(np_scalar).max)]
379 arr = pa.array(seq(expected), type=pa_type)
380 assert len(arr) == 6
381 assert arr.null_count == 2
382 assert arr.type == pa_type
383 assert arr.to_pylist() == expected
384
385
386 @parametrize_with_sequence_types
387 @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
388 def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
389 np_scalar, pa_type = np_scalar_pa_type
390 expected = [np_scalar(1), None, np_scalar(3), None]
391 expected += [np_scalar(np.iinfo(np_scalar).min),
392 np_scalar(np.iinfo(np_scalar).max)]
393 arr = pa.array(seq(expected))
394 assert len(arr) == 6
395 assert arr.null_count == 2
396 assert arr.type == pa_type
397 assert arr.to_pylist() == expected
398
399
400 @parametrize_with_sequence_types
401 def test_sequence_custom_integers(seq):
402 expected = [0, 42, 2**33 + 1, -2**63]
403 data = list(map(MyInt, expected))
404 arr = pa.array(seq(data), type=pa.int64())
405 assert arr.to_pylist() == expected
406
407
408 @parametrize_with_collections_types
409 def test_broken_integers(seq):
410 data = [MyBrokenInt()]
411 with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
412 pa.array(seq(data), type=pa.int64())
413
414
415 def test_numpy_scalars_mixed_type():
416 # ARROW-4324
417 data = [np.int32(10), np.float32(0.5)]
418 arr = pa.array(data)
419 expected = pa.array([10, 0.5], type="float64")
420 assert arr.equals(expected)
421
422 # ARROW-9490
423 data = [np.int8(10), np.float32(0.5)]
424 arr = pa.array(data)
425 expected = pa.array([10, 0.5], type="float32")
426 assert arr.equals(expected)
427
428
429 @pytest.mark.xfail(reason="Type inference for uint64 not implemented",
430 raises=OverflowError)
431 def test_uint64_max_convert():
432 data = [0, np.iinfo(np.uint64).max]
433
434 arr = pa.array(data, type=pa.uint64())
435 expected = pa.array(np.array(data, dtype='uint64'))
436 assert arr.equals(expected)
437
438 arr_inferred = pa.array(data)
439 assert arr_inferred.equals(expected)
440
441
442 @pytest.mark.parametrize("bits", [8, 16, 32, 64])
443 def test_signed_integer_overflow(bits):
444 ty = getattr(pa, "int%d" % bits)()
445 # XXX ideally would always raise OverflowError
446 with pytest.raises((OverflowError, pa.ArrowInvalid)):
447 pa.array([2 ** (bits - 1)], ty)
448 with pytest.raises((OverflowError, pa.ArrowInvalid)):
449 pa.array([-2 ** (bits - 1) - 1], ty)
450
451
452 @pytest.mark.parametrize("bits", [8, 16, 32, 64])
453 def test_unsigned_integer_overflow(bits):
454 ty = getattr(pa, "uint%d" % bits)()
455 # XXX ideally would always raise OverflowError
456 with pytest.raises((OverflowError, pa.ArrowInvalid)):
457 pa.array([2 ** bits], ty)
458 with pytest.raises((OverflowError, pa.ArrowInvalid)):
459 pa.array([-1], ty)
460
461
462 @parametrize_with_collections_types
463 @pytest.mark.parametrize("typ", pa_int_types)
464 def test_integer_from_string_error(seq, typ):
465 # ARROW-9451: pa.array(['1'], type=pa.uint32()) should not succeed
466 with pytest.raises(pa.ArrowInvalid):
467 pa.array(seq(['1']), type=typ)
468
469
470 def test_convert_with_mask():
471 data = [1, 2, 3, 4, 5]
472 mask = np.array([False, True, False, False, True])
473
474 result = pa.array(data, mask=mask)
475 expected = pa.array([1, None, 3, 4, None])
476
477 assert result.equals(expected)
478
479 # Mask wrong length
480 with pytest.raises(ValueError):
481 pa.array(data, mask=mask[1:])
482
483
484 def test_garbage_collection():
485 import gc
486
487 # Force the cyclic garbage collector to run
488 gc.collect()
489
490 bytes_before = pa.total_allocated_bytes()
491 pa.array([1, None, 3, None])
492 gc.collect()
493 assert pa.total_allocated_bytes() == bytes_before
494
495
496 def test_sequence_double():
497 data = [1.5, 1., None, 2.5, None, None]
498 arr = pa.array(data)
499 assert len(arr) == 6
500 assert arr.null_count == 3
501 assert arr.type == pa.float64()
502 assert arr.to_pylist() == data
503
504
505 def test_double_auto_coerce_from_integer():
506 # Done as part of ARROW-2814
507 data = [1.5, 1., None, 2.5, None, None]
508 arr = pa.array(data)
509
510 data2 = [1.5, 1, None, 2.5, None, None]
511 arr2 = pa.array(data2)
512
513 assert arr.equals(arr2)
514
515 data3 = [1, 1.5, None, 2.5, None, None]
516 arr3 = pa.array(data3)
517
518 data4 = [1., 1.5, None, 2.5, None, None]
519 arr4 = pa.array(data4)
520
521 assert arr3.equals(arr4)
522
523
524 def test_double_integer_coerce_representable_range():
525 valid_values = [1.5, 1, 2, None, 1 << 53, -(1 << 53)]
526 invalid_values = [1.5, 1, 2, None, (1 << 53) + 1]
527 invalid_values2 = [1.5, 1, 2, None, -((1 << 53) + 1)]
528
529 # it works
530 pa.array(valid_values)
531
532 # it fails
533 with pytest.raises(ValueError):
534 pa.array(invalid_values)
535
536 with pytest.raises(ValueError):
537 pa.array(invalid_values2)
538
539
540 def test_float32_integer_coerce_representable_range():
541 f32 = np.float32
542 valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
543 invalid_values = [f32(1.5), (1 << 24) + 1]
544 invalid_values2 = [f32(1.5), -((1 << 24) + 1)]
545
546 # it works
547 pa.array(valid_values, type=pa.float32())
548
549 # it fails
550 with pytest.raises(ValueError):
551 pa.array(invalid_values, type=pa.float32())
552
553 with pytest.raises(ValueError):
554 pa.array(invalid_values2, type=pa.float32())
555
556
557 def test_mixed_sequence_errors():
558 with pytest.raises(ValueError, match="tried to convert to boolean"):
559 pa.array([True, 'foo'], type=pa.bool_())
560
561 with pytest.raises(ValueError, match="tried to convert to float32"):
562 pa.array([1.5, 'foo'], type=pa.float32())
563
564 with pytest.raises(ValueError, match="tried to convert to double"):
565 pa.array([1.5, 'foo'])
566
567
568 @parametrize_with_sequence_types
569 @pytest.mark.parametrize("np_scalar,pa_type", [
570 (np.float16, pa.float16()),
571 (np.float32, pa.float32()),
572 (np.float64, pa.float64())
573 ])
574 @pytest.mark.parametrize("from_pandas", [True, False])
575 def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
576 data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
577 arr = pa.array(seq(data), from_pandas=from_pandas)
578 assert len(arr) == 6
579 if from_pandas:
580 assert arr.null_count == 3
581 else:
582 assert arr.null_count == 2
583 if from_pandas:
584 # The NaN is skipped in type inference, otherwise it forces a
585 # float64 promotion
586 assert arr.type == pa_type
587 else:
588 assert arr.type == pa.float64()
589
590 assert arr.to_pylist()[:4] == data[:4]
591 if from_pandas:
592 assert arr.to_pylist()[5] is None
593 else:
594 assert np.isnan(arr.to_pylist()[5])
595
596
597 @pytest.mark.parametrize("from_pandas", [True, False])
598 @pytest.mark.parametrize("inner_seq", [np.array, list])
599 def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
600 # ARROW-2806
601 data = np.array([
602 inner_seq([1., 2.]),
603 inner_seq([1., 2., 3.]),
604 inner_seq([np.nan]),
605 None
606 ], dtype=object)
607 arr = pa.array(data, from_pandas=from_pandas)
608 assert len(arr) == 4
609 assert arr.null_count == 1
610 assert arr.type == pa.list_(pa.float64())
611 if from_pandas:
612 assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
613 else:
614 np.testing.assert_equal(arr.to_pylist(),
615 [[1., 2.], [1., 2., 3.], [np.nan], None])
616
617
618 def test_nested_ndarray_in_object_array():
619 # ARROW-4350
620 arr = np.empty(2, dtype=object)
621 arr[:] = [np.array([1, 2], dtype=np.int64),
622 np.array([2, 3], dtype=np.int64)]
623
624 arr2 = np.empty(2, dtype=object)
625 arr2[0] = [3, 4]
626 arr2[1] = [5, 6]
627
628 expected_type = pa.list_(pa.list_(pa.int64()))
629 assert pa.infer_type([arr]) == expected_type
630
631 result = pa.array([arr, arr2])
632 expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
633 type=expected_type)
634
635 assert result.equals(expected)
636
637 # test case for len-1 arrays to ensure they are interpreted as
638 # sublists and not scalars
639 arr = np.empty(2, dtype=object)
640 arr[:] = [np.array([1]), np.array([2])]
641 result = pa.array([arr, arr])
642 assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
643
644
645 @pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
646 "not yet implemented"),
647 raises=AssertionError)
648 def test_multidimensional_ndarray_as_nested_list():
649 # TODO(wesm): see ARROW-5645
650 arr = np.array([[1, 2], [2, 3]], dtype=np.int64)
651 arr2 = np.array([[3, 4], [5, 6]], dtype=np.int64)
652
653 expected_type = pa.list_(pa.list_(pa.int64()))
654 assert pa.infer_type([arr]) == expected_type
655
656 result = pa.array([arr, arr2])
657 expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
658 type=expected_type)
659
660 assert result.equals(expected)
661
662
663 @pytest.mark.parametrize(('data', 'value_type'), [
664 ([True, False], pa.bool_()),
665 ([None, None], pa.null()),
666 ([1, 2, None], pa.int8()),
667 ([1, 2., 3., None], pa.float32()),
668 ([datetime.date.today(), None], pa.date32()),
669 ([None, datetime.date.today()], pa.date64()),
670 ([datetime.time(1, 1, 1), None], pa.time32('s')),
671 ([None, datetime.time(2, 2, 2)], pa.time64('us')),
672 ([datetime.datetime.now(), None], pa.timestamp('us')),
673 ([datetime.timedelta(seconds=10)], pa.duration('s')),
674 ([b"a", b"b"], pa.binary()),
675 ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
676 ([b"a", b"b", b"c"], pa.large_binary()),
677 (["a", "b", "c"], pa.string()),
678 (["a", "b", "c"], pa.large_string()),
679 (
680 [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
681 pa.struct([('a', pa.int8()), ('b', pa.int16())])
682 )
683 ])
684 def test_list_array_from_object_ndarray(data, value_type):
685 ty = pa.list_(value_type)
686 ndarray = np.array(data, dtype=object)
687 arr = pa.array([ndarray], type=ty)
688 assert arr.type.equals(ty)
689 assert arr.to_pylist() == [data]
690
691
692 @pytest.mark.parametrize(('data', 'value_type'), [
693 ([[1, 2], [3]], pa.list_(pa.int64())),
694 ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
695 ([[1], [2, 3]], pa.large_list(pa.int64()))
696 ])
697 def test_nested_list_array_from_object_ndarray(data, value_type):
698 ndarray = np.empty(len(data), dtype=object)
699 ndarray[:] = [np.array(item, dtype=object) for item in data]
700
701 ty = pa.list_(value_type)
702 arr = pa.array([ndarray], type=ty)
703 assert arr.type.equals(ty)
704 assert arr.to_pylist() == [data]
705
706
707 def test_array_ignore_nan_from_pandas():
708 # See ARROW-4324, this reverts logic that was introduced in
709 # ARROW-2240
710 with pytest.raises(ValueError):
711 pa.array([np.nan, 'str'])
712
713 arr = pa.array([np.nan, 'str'], from_pandas=True)
714 expected = pa.array([None, 'str'])
715 assert arr.equals(expected)
716
717
718 def test_nested_ndarray_different_dtypes():
719 data = [
720 np.array([1, 2, 3], dtype='int64'),
721 None,
722 np.array([4, 5, 6], dtype='uint32')
723 ]
724
725 arr = pa.array(data)
726 expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
727 type=pa.list_(pa.int64()))
728 assert arr.equals(expected)
729
730 t2 = pa.list_(pa.uint32())
731 arr2 = pa.array(data, type=t2)
732 expected2 = expected.cast(t2)
733 assert arr2.equals(expected2)
734
735
736 def test_sequence_unicode():
737 data = ['foo', 'bar', None, 'mañana']
738 arr = pa.array(data)
739 assert len(arr) == 4
740 assert arr.null_count == 1
741 assert arr.type == pa.string()
742 assert arr.to_pylist() == data
743
744
745 def check_array_mixed_unicode_bytes(binary_type, string_type):
746 values = ['qux', b'foo', bytearray(b'barz')]
747 b_values = [b'qux', b'foo', b'barz']
748 u_values = ['qux', 'foo', 'barz']
749
750 arr = pa.array(values)
751 expected = pa.array(b_values, type=pa.binary())
752 assert arr.type == pa.binary()
753 assert arr.equals(expected)
754
755 arr = pa.array(values, type=binary_type)
756 expected = pa.array(b_values, type=binary_type)
757 assert arr.type == binary_type
758 assert arr.equals(expected)
759
760 arr = pa.array(values, type=string_type)
761 expected = pa.array(u_values, type=string_type)
762 assert arr.type == string_type
763 assert arr.equals(expected)
764
765
766 def test_array_mixed_unicode_bytes():
767 check_array_mixed_unicode_bytes(pa.binary(), pa.string())
768 check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
769
770
771 @pytest.mark.large_memory
772 @pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
773 def test_large_binary_array(ty):
774 # Construct a large binary array with more than 4GB of data
775 s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
776 nrepeats = math.ceil((2**32 + 5) / len(s))
777 data = [s] * nrepeats
778 arr = pa.array(data, type=ty)
779 assert isinstance(arr, pa.Array)
780 assert arr.type == ty
781 assert len(arr) == nrepeats
782
783
784 @pytest.mark.slow
785 @pytest.mark.large_memory
786 @pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
787 def test_large_binary_value(ty):
788 # Construct a large binary array with a single value larger than 4GB
789 s = b"0123456789abcdefghijklmnopqrstuvwxyz"
790 nrepeats = math.ceil((2**32 + 5) / len(s))
791 arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
792 assert isinstance(arr, pa.Array)
793 assert arr.type == ty
794 assert len(arr) == 4
795 buf = arr[1].as_buffer()
796 assert len(buf) == len(s) * nrepeats
797
798
799 @pytest.mark.large_memory
800 @pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
801 def test_string_too_large(ty):
802 # Construct a binary array with a single value larger than 4GB
803 s = b"0123456789abcdefghijklmnopqrstuvwxyz"
804 nrepeats = math.ceil((2**32 + 5) / len(s))
805 with pytest.raises(pa.ArrowCapacityError):
806 pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
807
808
809 def test_sequence_bytes():
810 u1 = b'ma\xc3\xb1ana'
811
812 data = [b'foo',
813 memoryview(b'dada'),
814 memoryview(b'd-a-t-a')[::2], # non-contiguous is made contiguous
815 u1.decode('utf-8'), # unicode gets encoded
816 bytearray(b'bar'),
817 None]
818 for ty in [None, pa.binary(), pa.large_binary()]:
819 arr = pa.array(data, type=ty)
820 assert len(arr) == 6
821 assert arr.null_count == 1
822 assert arr.type == (ty or pa.binary())
823 assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]
824
825
826 @pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
827 def test_sequence_utf8_to_unicode(ty):
828 # ARROW-1225
829 data = [b'foo', None, b'bar']
830 arr = pa.array(data, type=ty)
831 assert arr.type == ty
832 assert arr[0].as_py() == 'foo'
833
834 # test a non-utf8 unicode string
835 val = ('mañana').encode('utf-16-le')
836 with pytest.raises(pa.ArrowInvalid):
837 pa.array([val], type=ty)
838
839
840 def test_sequence_fixed_size_bytes():
841 data = [b'foof', None, bytearray(b'barb'), b'2346']
842 arr = pa.array(data, type=pa.binary(4))
843 assert len(arr) == 4
844 assert arr.null_count == 1
845 assert arr.type == pa.binary(4)
846 assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
847
848
849 def test_fixed_size_bytes_does_not_accept_varying_lengths():
850 data = [b'foo', None, b'barb', b'2346']
851 with pytest.raises(pa.ArrowInvalid):
852 pa.array(data, type=pa.binary(4))
853
854
855 def test_fixed_size_binary_length_check():
856 # ARROW-10193
857 data = [b'\x19h\r\x9e\x00\x00\x00\x00\x01\x9b\x9fA']
858 assert len(data[0]) == 12
859 ty = pa.binary(12)
860 arr = pa.array(data, type=ty)
861 assert arr.to_pylist() == data
862
863
864 def test_sequence_date():
865 data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
866 datetime.date(2040, 2, 26)]
867 arr = pa.array(data)
868 assert len(arr) == 4
869 assert arr.type == pa.date32()
870 assert arr.null_count == 1
871 assert arr[0].as_py() == datetime.date(2000, 1, 1)
872 assert arr[1].as_py() is None
873 assert arr[2].as_py() == datetime.date(1970, 1, 1)
874 assert arr[3].as_py() == datetime.date(2040, 2, 26)
875
876
877 @pytest.mark.parametrize('input',
878 [(pa.date32(), [10957, None]),
879 (pa.date64(), [10957 * 86400000, None])])
880 def test_sequence_explicit_types(input):
881 t, ex_values = input
882 data = [datetime.date(2000, 1, 1), None]
883 arr = pa.array(data, type=t)
884 arr2 = pa.array(ex_values, type=t)
885
886 for x in [arr, arr2]:
887 assert len(x) == 2
888 assert x.type == t
889 assert x.null_count == 1
890 assert x[0].as_py() == datetime.date(2000, 1, 1)
891 assert x[1].as_py() is None
892
893
894 def test_date32_overflow():
895 # Overflow
896 data3 = [2**32, None]
897 with pytest.raises((OverflowError, pa.ArrowException)):
898 pa.array(data3, type=pa.date32())
899
900
901 @pytest.mark.parametrize(('time_type', 'unit', 'int_type'), [
902 (pa.time32, 's', 'int32'),
903 (pa.time32, 'ms', 'int32'),
904 (pa.time64, 'us', 'int64'),
905 (pa.time64, 'ns', 'int64'),
906 ])
907 def test_sequence_time_with_timezone(time_type, unit, int_type):
908 def expected_integer_value(t):
909 # only use with a UTC time object because this doesn't adjust for the
910 # tz offset
911 units = ['s', 'ms', 'us', 'ns']
912 multiplier = 10**(units.index(unit) * 3)
913 if t is None:
914 return None
915 seconds = (
916 t.hour * 3600 +
917 t.minute * 60 +
918 t.second +
919 t.microsecond * 10**-6
920 )
921 return int(seconds * multiplier)
922
923 def expected_time_value(t):
924 # only use with a UTC time object because this doesn't adjust for the
925 # time object's tzinfo
926 if unit == 's':
927 return t.replace(microsecond=0)
928 elif unit == 'ms':
929 return t.replace(microsecond=(t.microsecond // 1000) * 1000)
930 else:
931 return t
932
933 # only timezone-naive times are supported in Arrow
934 data = [
935 datetime.time(8, 23, 34, 123456),
936 datetime.time(5, 0, 0, 1000),
937 None,
938 datetime.time(1, 11, 56, 432539),
939 datetime.time(23, 10, 0, 437699)
940 ]
941
942 ty = time_type(unit)
943 arr = pa.array(data, type=ty)
944 assert len(arr) == 5
945 assert arr.type == ty
946 assert arr.null_count == 1
947
948 # test that the underlying integers are UTC values
949 values = arr.cast(int_type)
950 expected = list(map(expected_integer_value, data))
951 assert values.to_pylist() == expected
952
953 # test that the scalars are datetime.time objects with UTC timezone
954 assert arr[0].as_py() == expected_time_value(data[0])
955 assert arr[1].as_py() == expected_time_value(data[1])
956 assert arr[2].as_py() is None
957 assert arr[3].as_py() == expected_time_value(data[3])
958 assert arr[4].as_py() == expected_time_value(data[4])
959
960 def tz(hours, minutes=0):
961 offset = datetime.timedelta(hours=hours, minutes=minutes)
962 return datetime.timezone(offset)
963
964
965 def test_sequence_timestamp():
966 data = [
967 datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
968 None,
969 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
970 datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
971 ]
972 arr = pa.array(data)
973 assert len(arr) == 4
974 assert arr.type == pa.timestamp('us')
975 assert arr.null_count == 1
976 assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
977 23, 34, 123456)
978 assert arr[1].as_py() is None
979 assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
980 34, 56, 432539)
981 assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
982 46, 57, 437699)
983
984
985 @pytest.mark.parametrize('timezone', [
986 None,
987 'UTC',
988 'Etc/GMT-1',
989 'Europe/Budapest',
990 ])
991 @pytest.mark.parametrize('unit', [
992 's',
993 'ms',
994 'us',
995 'ns'
996 ])
997 def test_sequence_timestamp_with_timezone(timezone, unit):
998 def expected_integer_value(dt):
999 units = ['s', 'ms', 'us', 'ns']
1000 multiplier = 10**(units.index(unit) * 3)
1001 if dt is None:
1002 return None
1003 else:
1004 # avoid float precision issues
1005 ts = decimal.Decimal(str(dt.timestamp()))
1006 return int(ts * multiplier)
1007
1008 def expected_datetime_value(dt):
1009 if dt is None:
1010 return None
1011
1012 if unit == 's':
1013 dt = dt.replace(microsecond=0)
1014 elif unit == 'ms':
1015 dt = dt.replace(microsecond=(dt.microsecond // 1000) * 1000)
1016
1017 # adjust the timezone
1018 if timezone is None:
1019 # make datetime timezone unaware
1020 return dt.replace(tzinfo=None)
1021 else:
1022 # convert to the expected timezone
1023 return dt.astimezone(pytz.timezone(timezone))
1024
1025 data = [
1026 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1027 pytz.utc.localize(
1028 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1029 ),
1030 None,
1031 pytz.timezone('US/Eastern').localize(
1032 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1033 ),
1034 pytz.timezone('Europe/Moscow').localize(
1035 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1036 ),
1037 ]
1038 utcdata = [
1039 pytz.utc.localize(data[0]),
1040 data[1],
1041 None,
1042 data[3].astimezone(pytz.utc),
1043 data[4].astimezone(pytz.utc),
1044 ]
1045
1046 ty = pa.timestamp(unit, tz=timezone)
1047 arr = pa.array(data, type=ty)
1048 assert len(arr) == 5
1049 assert arr.type == ty
1050 assert arr.null_count == 1
1051
1052 # test that the underlying integers are UTC values
1053 values = arr.cast('int64')
1054 expected = list(map(expected_integer_value, utcdata))
1055 assert values.to_pylist() == expected
1056
1057 # test that the scalars are datetimes with the correct timezone
1058 for i in range(len(arr)):
1059 assert arr[i].as_py() == expected_datetime_value(utcdata[i])
1060
1061
1062 @pytest.mark.parametrize('timezone', [
1063 None,
1064 'UTC',
1065 'Etc/GMT-1',
1066 'Europe/Budapest',
1067 ])
1068 def test_pyarrow_ignore_timezone_environment_variable(monkeypatch, timezone):
1069 # note that any non-empty value will evaluate to true
1070 monkeypatch.setenv("PYARROW_IGNORE_TIMEZONE", "1")
1071 data = [
1072 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1073 pytz.utc.localize(
1074 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1075 ),
1076 pytz.timezone('US/Eastern').localize(
1077 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1078 ),
1079 pytz.timezone('Europe/Moscow').localize(
1080 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1081 ),
1082 ]
1083
1084 expected = [dt.replace(tzinfo=None) for dt in data]
1085 if timezone is not None:
1086 tzinfo = pytz.timezone(timezone)
1087 expected = [tzinfo.fromutc(dt) for dt in expected]
1088
1089 ty = pa.timestamp('us', tz=timezone)
1090 arr = pa.array(data, type=ty)
1091 assert arr.to_pylist() == expected
1092
1093
1094 def test_sequence_timestamp_with_timezone_inference():
1095 data = [
1096 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1097 pytz.utc.localize(
1098 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1099 ),
1100 None,
1101 pytz.timezone('US/Eastern').localize(
1102 datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
1103 ),
1104 pytz.timezone('Europe/Moscow').localize(
1105 datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
1106 ),
1107 ]
1108 expected = [
1109 pa.timestamp('us', tz=None),
1110 pa.timestamp('us', tz='UTC'),
1111 pa.timestamp('us', tz=None),
1112 pa.timestamp('us', tz='US/Eastern'),
1113 pa.timestamp('us', tz='Europe/Moscow')
1114 ]
1115 for dt, expected_type in zip(data, expected):
1116 prepended = [dt] + data
1117 arr = pa.array(prepended)
1118 assert arr.type == expected_type
1119
1120
1121 @pytest.mark.pandas
1122 def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes():
1123 import pandas as pd
1124
1125 data = [
1126 pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'),
1127 unit='ns'),
1128 datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive
1129 pytz.utc.localize(
1130 datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
1131 ),
1132 None,
1133 ]
1134 utcdata = [
1135 data[0].astimezone(pytz.utc),
1136 pytz.utc.localize(data[1]),
1137 data[2].astimezone(pytz.utc),
1138 None,
1139 ]
1140
1141 arr = pa.array(data)
1142 assert arr.type == pa.timestamp('us', tz='US/Eastern')
1143
1144 values = arr.cast('int64')
1145 expected = [int(dt.timestamp() * 10**6) if dt else None for dt in utcdata]
1146 assert values.to_pylist() == expected
1147
1148
1149 def test_sequence_timestamp_out_of_bounds_nanosecond():
1150 # https://issues.apache.org/jira/browse/ARROW-9768
1151 # datetime outside of range supported for nanosecond resolution
1152 data = [datetime.datetime(2262, 4, 12)]
1153 with pytest.raises(ValueError, match="out of bounds"):
1154 pa.array(data, type=pa.timestamp('ns'))
1155
1156 # with microsecond resolution it works fine
1157 arr = pa.array(data, type=pa.timestamp('us'))
1158 assert arr.to_pylist() == data
1159
1160 # case where the naive value is within bounds, but the UTC-converted value is not
1161 tz = datetime.timezone(datetime.timedelta(hours=-1))
1162 data = [datetime.datetime(2262, 4, 11, 23, tzinfo=tz)]
1163 with pytest.raises(ValueError, match="out of bounds"):
1164 pa.array(data, type=pa.timestamp('ns'))
1165
1166 arr = pa.array(data, type=pa.timestamp('us'))
1167 assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12)
1168
1169
1170 def test_sequence_numpy_timestamp():
1171 data = [
1172 np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
1173 None,
1174 np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
1175 np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
1176 ]
1177 arr = pa.array(data)
1178 assert len(arr) == 4
1179 assert arr.type == pa.timestamp('us')
1180 assert arr.null_count == 1
1181 assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1182 23, 34, 123456)
1183 assert arr[1].as_py() is None
1184 assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
1185 34, 56, 432539)
1186 assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
1187 46, 57, 437699)
1188
1189
1190 class MyDate(datetime.date):
1191 pass
1192
1193
1194 class MyDatetime(datetime.datetime):
1195 pass
1196
1197
1198 class MyTimedelta(datetime.timedelta):
1199 pass
1200
1201
1202 def test_datetime_subclassing():
1203 data = [
1204 MyDate(2007, 7, 13),
1205 ]
1206 date_type = pa.date32()
1207 arr_date = pa.array(data, type=date_type)
1208 assert len(arr_date) == 1
1209 assert arr_date.type == date_type
1210 assert arr_date[0].as_py() == datetime.date(2007, 7, 13)
1211
1212 data = [
1213 MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
1214 ]
1215
1216 s = pa.timestamp('s')
1217 ms = pa.timestamp('ms')
1218 us = pa.timestamp('us')
1219
1220 arr_s = pa.array(data, type=s)
1221 assert len(arr_s) == 1
1222 assert arr_s.type == s
1223 assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1224 23, 34, 0)
1225
1226 arr_ms = pa.array(data, type=ms)
1227 assert len(arr_ms) == 1
1228 assert arr_ms.type == ms
1229 assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1230 23, 34, 123000)
1231
1232 arr_us = pa.array(data, type=us)
1233 assert len(arr_us) == 1
1234 assert arr_us.type == us
1235 assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1236 23, 34, 123456)
1237
1238 data = [
1239 MyTimedelta(123, 456, 1002),
1240 ]
1241
1242 s = pa.duration('s')
1243 ms = pa.duration('ms')
1244 us = pa.duration('us')
1245
1246 arr_s = pa.array(data)
1247 assert len(arr_s) == 1
1248 assert arr_s.type == us
1249 assert arr_s[0].as_py() == datetime.timedelta(123, 456, 1002)
1250
1251 arr_s = pa.array(data, type=s)
1252 assert len(arr_s) == 1
1253 assert arr_s.type == s
1254 assert arr_s[0].as_py() == datetime.timedelta(123, 456)
1255
1256 arr_ms = pa.array(data, type=ms)
1257 assert len(arr_ms) == 1
1258 assert arr_ms.type == ms
1259 assert arr_ms[0].as_py() == datetime.timedelta(123, 456, 1000)
1260
1261 arr_us = pa.array(data, type=us)
1262 assert len(arr_us) == 1
1263 assert arr_us.type == us
1264 assert arr_us[0].as_py() == datetime.timedelta(123, 456, 1002)
1265
1266
1267 @pytest.mark.xfail(not _pandas_api.have_pandas,
1268 reason="pandas required for nanosecond conversion")
1269 def test_sequence_timestamp_nanoseconds():
1270 inputs = [
1271 [datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)],
1272 [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
1273 ]
1274
1275 for data in inputs:
1276 ns = pa.timestamp('ns')
1277 arr_ns = pa.array(data, type=ns)
1278 assert len(arr_ns) == 1
1279 assert arr_ns.type == ns
1280 assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
1281 23, 34, 123456)
1282
1283
1284 @pytest.mark.pandas
1285 def test_sequence_timestamp_from_int_with_unit():
1286 # TODO(wesm): This test might be rewritten to assert the actual behavior
1287 # when pandas is not installed
1288
1289 data = [1]
1290
1291 s = pa.timestamp('s')
1292 ms = pa.timestamp('ms')
1293 us = pa.timestamp('us')
1294 ns = pa.timestamp('ns')
1295
1296 arr_s = pa.array(data, type=s)
1297 assert len(arr_s) == 1
1298 assert arr_s.type == s
1299 assert repr(arr_s[0]) == (
1300 "<pyarrow.TimestampScalar: datetime.datetime(1970, 1, 1, 0, 0, 1)>"
1301 )
1302 assert str(arr_s[0]) == "1970-01-01 00:00:01"
1303
1304 arr_ms = pa.array(data, type=ms)
1305 assert len(arr_ms) == 1
1306 assert arr_ms.type == ms
1307 assert repr(arr_ms[0].as_py()) == (
1308 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1000)"
1309 )
1310 assert str(arr_ms[0]) == "1970-01-01 00:00:00.001000"
1311
1312 arr_us = pa.array(data, type=us)
1313 assert len(arr_us) == 1
1314 assert arr_us.type == us
1315 assert repr(arr_us[0].as_py()) == (
1316 "datetime.datetime(1970, 1, 1, 0, 0, 0, 1)"
1317 )
1318 assert str(arr_us[0]) == "1970-01-01 00:00:00.000001"
1319
1320 arr_ns = pa.array(data, type=ns)
1321 assert len(arr_ns) == 1
1322 assert arr_ns.type == ns
1323 assert repr(arr_ns[0].as_py()) == (
1324 "Timestamp('1970-01-01 00:00:00.000000001')"
1325 )
1326 assert str(arr_ns[0]) == "1970-01-01 00:00:00.000000001"
1327
1328 expected_exc = TypeError
1329
1330 class CustomClass():
1331 pass
1332
1333 for ty in [ns, pa.date32(), pa.date64()]:
1334 with pytest.raises(expected_exc):
1335 pa.array([1, CustomClass()], type=ty)
1336
1337
1338 @pytest.mark.parametrize('np_scalar', [True, False])
1339 def test_sequence_duration(np_scalar):
1340 td1 = datetime.timedelta(2, 3601, 1)
1341 td2 = datetime.timedelta(1, 100, 1000)
1342 if np_scalar:
1343 data = [np.timedelta64(td1), None, np.timedelta64(td2)]
1344 else:
1345 data = [td1, None, td2]
1346
1347 arr = pa.array(data)
1348 assert len(arr) == 3
1349 assert arr.type == pa.duration('us')
1350 assert arr.null_count == 1
1351 assert arr[0].as_py() == td1
1352 assert arr[1].as_py() is None
1353 assert arr[2].as_py() == td2
1354
1355
1356 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
1357 def test_sequence_duration_with_unit(unit):
1358 data = [
1359 datetime.timedelta(3, 22, 1001),
1360 ]
1361 expected = {'s': datetime.timedelta(3, 22),
1362 'ms': datetime.timedelta(3, 22, 1000),
1363 'us': datetime.timedelta(3, 22, 1001),
1364 'ns': datetime.timedelta(3, 22, 1001)}
1365
1366 ty = pa.duration(unit)
1367
1368 arr_s = pa.array(data, type=ty)
1369 assert len(arr_s) == 1
1370 assert arr_s.type == ty
1371 assert arr_s[0].as_py() == expected[unit]
1372
1373
1374 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
1375 def test_sequence_duration_from_int_with_unit(unit):
1376 data = [5]
1377
1378 ty = pa.duration(unit)
1379 arr = pa.array(data, type=ty)
1380 assert len(arr) == 1
1381 assert arr.type == ty
1382 assert arr[0].value == 5
1383
1384
1385 def test_sequence_duration_nested_lists():
1386 td1 = datetime.timedelta(1, 1, 1000)
1387 td2 = datetime.timedelta(1, 100)
1388
1389 data = [[td1, None], [td1, td2]]
1390
1391 arr = pa.array(data)
1392 assert len(arr) == 2
1393 assert arr.type == pa.list_(pa.duration('us'))
1394 assert arr.to_pylist() == data
1395
1396 arr = pa.array(data, type=pa.list_(pa.duration('ms')))
1397 assert len(arr) == 2
1398 assert arr.type == pa.list_(pa.duration('ms'))
1399 assert arr.to_pylist() == data
1400
1401
1402 def test_sequence_duration_nested_lists_numpy():
1403 td1 = datetime.timedelta(1, 1, 1000)
1404 td2 = datetime.timedelta(1, 100)
1405
1406 data = [[np.timedelta64(td1), None],
1407 [np.timedelta64(td1), np.timedelta64(td2)]]
1408
1409 arr = pa.array(data)
1410 assert len(arr) == 2
1411 assert arr.type == pa.list_(pa.duration('us'))
1412 assert arr.to_pylist() == [[td1, None], [td1, td2]]
1413
1414 data = [np.array([np.timedelta64(td1), None], dtype='timedelta64[us]'),
1415 np.array([np.timedelta64(td1), np.timedelta64(td2)])]
1416
1417 arr = pa.array(data)
1418 assert len(arr) == 2
1419 assert arr.type == pa.list_(pa.duration('us'))
1420 assert arr.to_pylist() == [[td1, None], [td1, td2]]
1421
1422
1423 def test_sequence_nesting_levels():
1424 data = [1, 2, None]
1425 arr = pa.array(data)
1426 assert arr.type == pa.int64()
1427 assert arr.to_pylist() == data
1428
1429 data = [[1], [2], None]
1430 arr = pa.array(data)
1431 assert arr.type == pa.list_(pa.int64())
1432 assert arr.to_pylist() == data
1433
1434 data = [[1], [2, 3, 4], [None]]
1435 arr = pa.array(data)
1436 assert arr.type == pa.list_(pa.int64())
1437 assert arr.to_pylist() == data
1438
1439 data = [None, [[None, 1]], [[2, 3, 4], None], [None]]
1440 arr = pa.array(data)
1441 assert arr.type == pa.list_(pa.list_(pa.int64()))
1442 assert arr.to_pylist() == data
1443
1444 exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)
1445
1446 # Mixed nesting levels are rejected
1447 with pytest.raises(exceptions):
1448 pa.array([1, 2, [1]])
1449
1450 with pytest.raises(exceptions):
1451 pa.array([1, 2, []])
1452
1453 with pytest.raises(exceptions):
1454 pa.array([[1], [2], [None, [1]]])
1455
1456
1457 def test_sequence_mixed_types_fails():
1458 data = ['a', 1, 2.0]
1459 with pytest.raises(pa.ArrowTypeError):
1460 pa.array(data)
1461
1462
1463 def test_sequence_mixed_types_with_specified_type_fails():
1464 data = ['-10', '-5', {'a': 1}, '0', '5', '10']
1465
1466 type = pa.string()
1467 with pytest.raises(TypeError):
1468 pa.array(data, type=type)
1469
1470
1471 def test_sequence_decimal():
1472 data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
1473 for type in [pa.decimal128, pa.decimal256]:
1474 arr = pa.array(data, type=type(precision=7, scale=3))
1475 assert arr.to_pylist() == data
1476
1477
1478 def test_sequence_decimal_different_precisions():
1479 data = [
1480 decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
1481 ]
1482 for type in [pa.decimal128, pa.decimal256]:
1483 arr = pa.array(data, type=type(precision=13, scale=3))
1484 assert arr.to_pylist() == data
1485
1486
1487 def test_sequence_decimal_no_scale():
1488 data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
1489 for type in [pa.decimal128, pa.decimal256]:
1490 arr = pa.array(data, type=type(precision=10))
1491 assert arr.to_pylist() == data
1492
1493
1494 def test_sequence_decimal_negative():
1495 data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
1496 for type in [pa.decimal128, pa.decimal256]:
1497 arr = pa.array(data, type=type(precision=10, scale=6))
1498 assert arr.to_pylist() == data
1499
1500
1501 def test_sequence_decimal_no_whole_part():
1502 data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
1503 for type in [pa.decimal128, pa.decimal256]:
1504 arr = pa.array(data, type=type(precision=7, scale=7))
1505 assert arr.to_pylist() == data
1506
1507
1508 def test_sequence_decimal_large_integer():
1509 data = [decimal.Decimal('-394029506937548693.42983'),
1510 decimal.Decimal('32358695912932.01033')]
1511 for type in [pa.decimal128, pa.decimal256]:
1512 arr = pa.array(data, type=type(precision=23, scale=5))
1513 assert arr.to_pylist() == data
1514
1515
1516 def test_sequence_decimal_from_integers():
1517 data = [0, 1, -39402950693754869342983]
1518 expected = [decimal.Decimal(x) for x in data]
1519 for type in [pa.decimal128, pa.decimal256]:
1520 arr = pa.array(data, type=type(precision=28, scale=5))
1521 assert arr.to_pylist() == expected
1522
1523
1524 def test_sequence_decimal_too_high_precision():
1525 # ARROW-6989 python decimal has too high precision
1526 with pytest.raises(ValueError, match="precision out of range"):
1527 pa.array([decimal.Decimal('1' * 80)])
1528
1529
1530 def test_sequence_decimal_infer():
1531 for data, typ in [
1532 # simple case
1533 (decimal.Decimal('1.234'), pa.decimal128(4, 3)),
1534 # trailing zeros
1535 (decimal.Decimal('12300'), pa.decimal128(5, 0)),
1536 (decimal.Decimal('12300.0'), pa.decimal128(6, 1)),
1537 # scientific power notation
1538 (decimal.Decimal('1.23E+4'), pa.decimal128(5, 0)),
1539 (decimal.Decimal('123E+2'), pa.decimal128(5, 0)),
1540 (decimal.Decimal('123E+4'), pa.decimal128(7, 0)),
1541 # leading zeros
1542 (decimal.Decimal('0.0123'), pa.decimal128(4, 4)),
1543 (decimal.Decimal('0.01230'), pa.decimal128(5, 5)),
1544 (decimal.Decimal('1.230E-2'), pa.decimal128(5, 5)),
1545 ]:
1546 assert pa.infer_type([data]) == typ
1547 arr = pa.array([data])
1548 assert arr.type == typ
1549 assert arr.to_pylist()[0] == data
1550
1551
1552 def test_sequence_decimal_infer_mixed():
1553 # ARROW-12150 - ensure mixed precision gets correctly inferred to
1554 # common type that can hold all input values
1555 cases = [
1556 ([decimal.Decimal('1.234'), decimal.Decimal('3.456')],
1557 pa.decimal128(4, 3)),
1558 ([decimal.Decimal('1.234'), decimal.Decimal('456.7')],
1559 pa.decimal128(6, 3)),
1560 ([decimal.Decimal('123.4'), decimal.Decimal('4.567')],
1561 pa.decimal128(6, 3)),
1562 ([decimal.Decimal('123e2'), decimal.Decimal('4567e3')],
1563 pa.decimal128(7, 0)),
1564 ([decimal.Decimal('123e4'), decimal.Decimal('4567e2')],
1565 pa.decimal128(7, 0)),
1566 ([decimal.Decimal('0.123'), decimal.Decimal('0.04567')],
1567 pa.decimal128(5, 5)),
1568 ([decimal.Decimal('0.001'), decimal.Decimal('1.01E5')],
1569 pa.decimal128(9, 3)),
1570 ]
1571 for data, typ in cases:
1572 assert pa.infer_type(data) == typ
1573 arr = pa.array(data)
1574 assert arr.type == typ
1575 assert arr.to_pylist() == data
1576
1577
1578 def test_sequence_decimal_given_type():
1579 for data, typs, wrong_typs in [
1580 # simple case
1581 (
1582 decimal.Decimal('1.234'),
1583 [pa.decimal128(4, 3), pa.decimal128(5, 3), pa.decimal128(5, 4)],
1584 [pa.decimal128(4, 2), pa.decimal128(4, 4)]
1585 ),
1586 # trailing zeros
1587 (
1588 decimal.Decimal('12300'),
1589 [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
1590 [pa.decimal128(4, 0), pa.decimal128(3, -3)]
1591 ),
1592 # scientific power notation
1593 (
1594 decimal.Decimal('1.23E+4'),
1595 [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
1596 [pa.decimal128(4, 0), pa.decimal128(3, -3)]
1597 ),
1598 ]:
1599 for typ in typs:
1600 arr = pa.array([data], type=typ)
1601 assert arr.type == typ
1602 assert arr.to_pylist()[0] == data
1603 for typ in wrong_typs:
1604 with pytest.raises(ValueError):
1605 pa.array([data], type=typ)
1606
1607
1608 def test_range_types():
1609 arr1 = pa.array(range(3))
1610 arr2 = pa.array((0, 1, 2))
1611 assert arr1.equals(arr2)
1612
1613
1614 def test_empty_range():
1615 arr = pa.array(range(0))
1616 assert len(arr) == 0
1617 assert arr.null_count == 0
1618 assert arr.type == pa.null()
1619 assert arr.to_pylist() == []
1620
1621
1622 def test_structarray():
1623 arr = pa.StructArray.from_arrays([], names=[])
1624 assert arr.type == pa.struct([])
1625 assert len(arr) == 0
1626 assert arr.to_pylist() == []
1627
1628 ints = pa.array([None, 2, 3], type=pa.int64())
1629 strs = pa.array(['a', None, 'c'], type=pa.string())
1630 bools = pa.array([True, False, None], type=pa.bool_())
1631 arr = pa.StructArray.from_arrays(
1632 [ints, strs, bools],
1633 ['ints', 'strs', 'bools'])
1634
1635 expected = [
1636 {'ints': None, 'strs': 'a', 'bools': True},
1637 {'ints': 2, 'strs': None, 'bools': False},
1638 {'ints': 3, 'strs': 'c', 'bools': None},
1639 ]
1640
1641 pylist = arr.to_pylist()
1642 assert pylist == expected, (pylist, expected)
1643
1644 # len(names) != len(arrays)
1645 with pytest.raises(ValueError):
1646 pa.StructArray.from_arrays([ints], ['ints', 'strs'])
1647
1648
1649 def test_struct_from_dicts():
1650 ty = pa.struct([pa.field('a', pa.int32()),
1651 pa.field('b', pa.string()),
1652 pa.field('c', pa.bool_())])
1653 arr = pa.array([], type=ty)
1654 assert arr.to_pylist() == []
1655
1656 data = [{'a': 5, 'b': 'foo', 'c': True},
1657 {'a': 6, 'b': 'bar', 'c': False}]
1658 arr = pa.array(data, type=ty)
1659 assert arr.to_pylist() == data
1660
1661 # With omitted values
1662 data = [{'a': 5, 'c': True},
1663 None,
1664 {},
1665 {'a': None, 'b': 'bar'}]
1666 arr = pa.array(data, type=ty)
1667 expected = [{'a': 5, 'b': None, 'c': True},
1668 None,
1669 {'a': None, 'b': None, 'c': None},
1670 {'a': None, 'b': 'bar', 'c': None}]
1671 assert arr.to_pylist() == expected
1672
1673
1674 def test_struct_from_dicts_bytes_keys():
1675 # ARROW-6878
1676 ty = pa.struct([pa.field('a', pa.int32()),
1677 pa.field('b', pa.string()),
1678 pa.field('c', pa.bool_())])
1679 arr = pa.array([], type=ty)
1680 assert arr.to_pylist() == []
1681
1682 data = [{b'a': 5, b'b': 'foo'},
1683 {b'a': 6, b'c': False}]
1684 arr = pa.array(data, type=ty)
1685 assert arr.to_pylist() == [
1686 {'a': 5, 'b': 'foo', 'c': None},
1687 {'a': 6, 'b': None, 'c': False},
1688 ]
1689
1690
1691 def test_struct_from_tuples():
1692 ty = pa.struct([pa.field('a', pa.int32()),
1693 pa.field('b', pa.string()),
1694 pa.field('c', pa.bool_())])
1695
1696 data = [(5, 'foo', True),
1697 (6, 'bar', False)]
1698 expected = [{'a': 5, 'b': 'foo', 'c': True},
1699 {'a': 6, 'b': 'bar', 'c': False}]
1700 arr = pa.array(data, type=ty)
1701
1702 data_as_ndarray = np.empty(len(data), dtype=object)
1703 data_as_ndarray[:] = data
1704 arr2 = pa.array(data_as_ndarray, type=ty)
1705 assert arr.to_pylist() == expected
1706
1707 assert arr.equals(arr2)
1708
1709 # With omitted values
1710 data = [(5, 'foo', None),
1711 None,
1712 (6, None, False)]
1713 expected = [{'a': 5, 'b': 'foo', 'c': None},
1714 None,
1715 {'a': 6, 'b': None, 'c': False}]
1716 arr = pa.array(data, type=ty)
1717 assert arr.to_pylist() == expected
1718
1719 # Invalid tuple size
1720 for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
1721 with pytest.raises(ValueError, match="(?i)tuple size"):
1722 pa.array([tup], type=ty)
1723
1724
1725 def test_struct_from_list_of_pairs():
1726 ty = pa.struct([
1727 pa.field('a', pa.int32()),
1728 pa.field('b', pa.string()),
1729 pa.field('c', pa.bool_())
1730 ])
1731 data = [
1732 [('a', 5), ('b', 'foo'), ('c', True)],
1733 [('a', 6), ('b', 'bar'), ('c', False)],
1734 None
1735 ]
1736 arr = pa.array(data, type=ty)
1737 assert arr.to_pylist() == [
1738 {'a': 5, 'b': 'foo', 'c': True},
1739 {'a': 6, 'b': 'bar', 'c': False},
1740 None
1741 ]
1742
1743 # test with duplicated field names
1744 ty = pa.struct([
1745 pa.field('a', pa.int32()),
1746 pa.field('a', pa.string()),
1747 pa.field('b', pa.bool_())
1748 ])
1749 data = [
1750 [('a', 5), ('a', 'foo'), ('b', True)],
1751 [('a', 6), ('a', 'bar'), ('b', False)],
1752 ]
1753 arr = pa.array(data, type=ty)
1754 with pytest.raises(ValueError):
1755 # TODO(kszucs): ARROW-9997
1756 arr.to_pylist()
1757
1758 # test with empty elements
1759 ty = pa.struct([
1760 pa.field('a', pa.int32()),
1761 pa.field('b', pa.string()),
1762 pa.field('c', pa.bool_())
1763 ])
1764 data = [
1765 [],
1766 [('a', 5), ('b', 'foo'), ('c', True)],
1767 [('a', 2), ('b', 'baz')],
1768 [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')],
1769 ]
1770 expected = [
1771 {'a': None, 'b': None, 'c': None},
1772 {'a': 5, 'b': 'foo', 'c': True},
1773 {'a': 2, 'b': 'baz', 'c': None},
1774 {'a': 1, 'b': 'bar', 'c': False},
1775 ]
1776 arr = pa.array(data, type=ty)
1777 assert arr.to_pylist() == expected
1778
1779
1780 def test_struct_from_list_of_pairs_errors():
1781 ty = pa.struct([
1782 pa.field('a', pa.int32()),
1783 pa.field('b', pa.string()),
1784 pa.field('c', pa.bool_())
1785 ])
1786
1787 # test that it raises if the key doesn't match the expected field name
1788 data = [
1789 [],
1790 [('a', 5), ('c', True), ('b', None)],
1791 ]
1792 msg = "The expected field name is `b` but `c` was given"
1793 with pytest.raises(ValueError, match=msg):
1794 pa.array(data, type=ty)
1795
1796 # test various errors both at the first position and after because of key
1797 # type inference
1798 template = (
1799 r"Could not convert {} with type {}: was expecting tuple of "
1800 r"(key, value) pair"
1801 )
1802 cases = [
1803 tuple(), # empty key-value pair
1804 tuple('a',), # missing value
1805 tuple('unknown-key',), # not known field name
1806 'string', # not a tuple
1807 ]
1808 for key_value_pair in cases:
1809 msg = re.escape(template.format(
1810 repr(key_value_pair), type(key_value_pair).__name__
1811 ))
1812
1813 with pytest.raises(TypeError, match=msg):
1814 pa.array([
1815 [key_value_pair],
1816 [('a', 5), ('b', 'foo'), ('c', None)],
1817 ], type=ty)
1818
1819 with pytest.raises(TypeError, match=msg):
1820 pa.array([
1821 [('a', 5), ('b', 'foo'), ('c', None)],
1822 [key_value_pair],
1823 ], type=ty)
1824
1825
1826 def test_struct_from_mixed_sequence():
1827 # It is forbidden to mix dicts and tuples when initializing a struct array
1828 ty = pa.struct([pa.field('a', pa.int32()),
1829 pa.field('b', pa.string()),
1830 pa.field('c', pa.bool_())])
1831 data = [(5, 'foo', True),
1832 {'a': 6, 'b': 'bar', 'c': False}]
1833 with pytest.raises(TypeError):
1834 pa.array(data, type=ty)
1835
1836
1837 def test_struct_from_dicts_inference():
1838 expected_type = pa.struct([pa.field('a', pa.int64()),
1839 pa.field('b', pa.string()),
1840 pa.field('c', pa.bool_())])
1841 data = [{'a': 5, 'b': 'foo', 'c': True},
1842 {'a': 6, 'b': 'bar', 'c': False}]
1843
1844 arr = pa.array(data)
1845 check_struct_type(arr.type, expected_type)
1846 assert arr.to_pylist() == data
1847
1848 # With omitted values
1849 data = [{'a': 5, 'c': True},
1850 None,
1851 {},
1852 {'a': None, 'b': 'bar'}]
1853 expected = [{'a': 5, 'b': None, 'c': True},
1854 None,
1855 {'a': None, 'b': None, 'c': None},
1856 {'a': None, 'b': 'bar', 'c': None}]
1857
1858 arr = pa.array(data)
1859 data_as_ndarray = np.empty(len(data), dtype=object)
1860 data_as_ndarray[:] = data
1861 arr2 = pa.array(data_as_ndarray)  # same data via an object-dtype ndarray
1862
1863 check_struct_type(arr.type, expected_type)
1864 assert arr.to_pylist() == expected
1865 assert arr.equals(arr2)
1866
1867 # Nested
1868 expected_type = pa.struct([
1869 pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
1870 pa.field('ab', pa.bool_())])),
1871 pa.field('b', pa.string())])
1872 data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
1873 {'a': {'aa': None, 'ab': False}, 'b': None},
1874 {'a': None, 'b': 'bar'}]
1875 arr = pa.array(data)
1876
1877 assert arr.to_pylist() == data
1878
1879 # Edge cases
1880 arr = pa.array([{}])
1881 assert arr.type == pa.struct([])
1882 assert arr.to_pylist() == [{}]
1883
1884 # Mixing structs and scalars is rejected
1885 with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
1886 pa.array([1, {'a': 2}])
1887
1888
1889 def test_structarray_from_arrays_coerce():
1890 # ARROW-1706
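# the plain Python lists below are coerced with default type inference
# (int64 / utf8 / bool); calling from_arrays() without field names is expected
# to raise ValueError, as asserted at the end of the test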
1891 ints = [None, 2, 3]
1892 strs = ['a', None, 'c']
1893 bools = [True, False, None]
1894 ints_nonnull = [1, 2, 3]
1895
1896 arrays = [ints, strs, bools, ints_nonnull]
1897 result = pa.StructArray.from_arrays(arrays,
1898 ['ints', 'strs', 'bools',
1899 'int_nonnull'])
1900 expected = pa.StructArray.from_arrays(
1901 [pa.array(ints, type='int64'),
1902 pa.array(strs, type='utf8'),
1903 pa.array(bools),
1904 pa.array(ints_nonnull, type='int64')],
1905 ['ints', 'strs', 'bools', 'int_nonnull'])
1906
1907 with pytest.raises(ValueError):
1908 pa.StructArray.from_arrays(arrays)
1909
1910 assert result.equals(expected)
1911
1912
1913 def test_decimal_array_with_none_and_nan():
1914 values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]
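# with from_pandas=True, both np.nan and Decimal('nan') are treated as nulls,
# which is what the assertions below expect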
1915
1916 with pytest.raises(TypeError):
1917 # ARROW-6227: Without from_pandas=True, NaN is considered a float
1918 array = pa.array(values)
1919
1920 array = pa.array(values, from_pandas=True)
1921 assert array.type == pa.decimal128(4, 3)
1922 assert array.to_pylist() == values[:2] + [None, None]
1923
1924 array = pa.array(values, type=pa.decimal128(10, 4), from_pandas=True)
1925 assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
1926
1927
1928 def test_map_from_dicts():
1929 data = [[{'key': b'a', 'value': 1}, {'key': b'b', 'value': 2}],
1930 [{'key': b'c', 'value': 3}],
1931 [{'key': b'd', 'value': 4}, {'key': b'e', 'value': 5},
1932 {'key': b'f', 'value': None}],
1933 [{'key': b'g', 'value': 7}]]
1934 expected = [[(d['key'], d['value']) for d in entry] for entry in data]
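# expected keeps the (key, value) tuple form because that is how map entries
# come back from to_pylist()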
1935
1936 arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
1937
1938 assert arr.to_pylist() == expected
1939
1940 # With omitted values
1941 data[1] = None
1942 expected[1] = None
1943
1944 arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
1945
1946 assert arr.to_pylist() == expected
1947
1948 # Invalid dictionary
1949 for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]:
1950 with pytest.raises(ValueError, match="Invalid Map"):
1951 pa.array([entry], type=pa.map_('i4', 'i4'))
1952
1953 # Invalid dictionary types
1954 for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]:
1955 with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
1956 pa.array([entry], type=pa.map_('i4', 'i4'))
1957
1958
1959 def test_map_from_tuples():
1960 expected = [[(b'a', 1), (b'b', 2)],
1961 [(b'c', 3)],
1962 [(b'd', 4), (b'e', 5), (b'f', None)],
1963 [(b'g', 7)]]
1964
1965 arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
1966
1967 assert arr.to_pylist() == expected
1968
1969 # With omitted values
1970 expected[1] = None
1971
1972 arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
1973
1974 assert arr.to_pylist() == expected
1975
1976 # Invalid tuple size
1977 for entry in [[(5,)], [()], [('5', 'foo', True)]]:
1978 with pytest.raises(ValueError, match="(?i)tuple size"):
1979 pa.array([entry], type=pa.map_('i4', 'i4'))
1980
1981
1982 def test_dictionary_from_boolean():
1983 typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
1984 a = pa.array([False, False, True, False, True], type=typ)
1985 assert isinstance(a.type, pa.DictionaryType)
1986 assert a.type.equals(typ)
1987
1988 expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8())
1989 expected_dictionary = pa.array([False, True], type=pa.bool_())
1990 assert a.indices.equals(expected_indices)
1991 assert a.dictionary.equals(expected_dictionary)
1992
1993
1994 @pytest.mark.parametrize('value_type', [
1995 pa.int8(),
1996 pa.int16(),
1997 pa.int32(),
1998 pa.int64(),
1999 pa.uint8(),
2000 pa.uint16(),
2001 pa.uint32(),
2002 pa.uint64(),
2003 pa.float32(),
2004 pa.float64(),
2005 ])
2006 def test_dictionary_from_integers(value_type):
2007 typ = pa.dictionary(pa.int8(), value_type=value_type)
2008 a = pa.array([1, 2, 1, 1, 2, 3], type=typ)
2009 assert isinstance(a.type, pa.DictionaryType)
2010 assert a.type.equals(typ)
2011
2012 expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8())
2013 expected_dictionary = pa.array([1, 2, 3], type=value_type)
2014 assert a.indices.equals(expected_indices)
2015 assert a.dictionary.equals(expected_dictionary)
2016
2017
2018 @pytest.mark.parametrize('input_index_type', [
2019 pa.int8(),
2020 pa.int16(),
2021 pa.int32(),
2022 pa.int64()
2023 ])
2024 def test_dictionary_index_type(input_index_type):
2025 # the dictionary array is constructed with an adaptive index type builder,
2026 # but the requested index type is treated as the minimum width to use
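# e.g. requesting pa.int64() indices for only ten distinct values still yields
# int64 indices; the builder never narrows below the requested width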
2027
2028 typ = pa.dictionary(input_index_type, value_type=pa.int64())
2029 arr = pa.array(range(10), type=typ)
2030 assert arr.type.equals(typ)
2031
2032
2033 def test_dictionary_is_always_adaptive():
2034 # the dictionary array is constructed with an adaptive index type builder,
2035 # so the output index type may be wider than the requested one, depending on
2036 # how many distinct values the input contains
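# e.g. 2**7 distinct values still fit int8 indices, while 2**7 + 1 forces a
# widening to int16, as asserted below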
2037 typ = pa.dictionary(pa.int8(), value_type=pa.int64())
2038
2039 a = pa.array(range(2**7), type=typ)
2040 expected = pa.dictionary(pa.int8(), pa.int64())
2041 assert a.type.equals(expected)
2042
2043 a = pa.array(range(2**7 + 1), type=typ)
2044 expected = pa.dictionary(pa.int16(), pa.int64())
2045 assert a.type.equals(expected)
2046
2047
2048 def test_dictionary_from_strings():
2049 for value_type in [pa.binary(), pa.string()]:
2050 typ = pa.dictionary(pa.int8(), value_type)
2051 a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ)
2052
2053 assert isinstance(a.type, pa.DictionaryType)
2054
2055 expected_indices = pa.array([0, 1, 2, 1, 2, 3], type=pa.int8())
2056 expected_dictionary = pa.array(["", "a", "bb", "ccc"], type=value_type)
2057 assert a.indices.equals(expected_indices)
2058 assert a.dictionary.equals(expected_dictionary)
2059
2060 # fixed size binary type
2061 typ = pa.dictionary(pa.int8(), pa.binary(3))
2062 a = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ)
2063 assert isinstance(a.type, pa.DictionaryType)
2064
2065 expected_indices = pa.array([0, 0, 1, 2, 1], type=pa.int8())
2066 expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3))
2067 assert a.indices.equals(expected_indices)
2068 assert a.dictionary.equals(expected_dictionary)
2069
2070
2071 @pytest.mark.parametrize(('unit', 'expected'), [
2072 ('s', datetime.timedelta(seconds=-2147483000)),
2073 ('ms', datetime.timedelta(milliseconds=-2147483000)),
2074 ('us', datetime.timedelta(microseconds=-2147483000)),
2075 ('ns', datetime.timedelta(microseconds=-2147483))
2076 ])
2077 def test_duration_array_roundtrip_corner_cases(unit, expected):
2078 # Corner case discovered by hypothesis: there were implicit conversions to
2079 # unsigned values, resulting in values with the wrong sign.
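# the expected values in the parametrize list are datetime.timedelta objects,
# which only carry microsecond resolution; that is why the 'ns' case is
# expressed as -2147483 whole microseconds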
2080 ty = pa.duration(unit)
2081 arr = pa.array([-2147483000], type=ty)
2082 restored = pa.array(arr.to_pylist(), type=ty)
2083 assert arr.equals(restored)
2084
2085 expected_list = [expected]
2086 if unit == 'ns':
2087 # if pandas is available then a pandas Timedelta is returned
2088 try:
2089 import pandas as pd
2090 except ImportError:
2091 pass
2092 else:
2093 expected_list = [pd.Timedelta(-2147483000, unit='ns')]
2094
2095 assert restored.to_pylist() == expected_list
2096
2097
2098 @pytest.mark.pandas
2099 def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
2100 # corner case discovered by hypothesis: preserving the nanoseconds on
2101 # conversion from a list of Timedelta and Timestamp objects
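# datetime.timedelta and datetime.datetime cannot represent nanoseconds, so
# pandas Timedelta / Timestamp objects are required for a lossless roundtrip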
2102 import pandas as pd
2103
2104 ty = pa.duration('ns')
2105 arr = pa.array([9223371273709551616], type=ty)
2106 data = arr.to_pylist()
2107 assert isinstance(data[0], pd.Timedelta)
2108 restored = pa.array(data, type=ty)
2109 assert arr.equals(restored)
2110 assert restored.to_pylist() == [
2111 pd.Timedelta(9223371273709551616, unit='ns')
2112 ]
2113
2114 ty = pa.timestamp('ns')
2115 arr = pa.array([9223371273709551616], type=ty)
2116 data = arr.to_pylist()
2117 assert isinstance(data[0], pd.Timestamp)
2118 restored = pa.array(data, type=ty)
2119 assert arr.equals(restored)
2120 assert restored.to_pylist() == [
2121 pd.Timestamp(9223371273709551616, unit='ns')
2122 ]
2123
2124 ty = pa.timestamp('ns', tz='US/Eastern')
2125 value = 1604119893000000000
2126 arr = pa.array([value], type=ty)
2127 data = arr.to_pylist()
2128 assert isinstance(data[0], pd.Timestamp)
2129 restored = pa.array(data, type=ty)
2130 assert arr.equals(restored)
2131 assert restored.to_pylist() == [
2132 pd.Timestamp(value, unit='ns').tz_localize(
2133 "UTC").tz_convert('US/Eastern')
2134 ]
2135
2136
2137 @h.given(past.all_arrays)
2138 def test_array_to_pylist_roundtrip(arr):
2139 seq = arr.to_pylist()
2140 restored = pa.array(seq, type=arr.type)
2141 assert restored.equals(arr)
2142
2143
2144 @pytest.mark.large_memory
2145 def test_auto_chunking_binary_like():
2146 # v1 and v2 are sized so that 20 * len(v1) + len(v2) stays just below the size limit of a single binary array chunk
2147 v1 = b'x' * 100000000
2148 v2 = b'x' * 147483646
2149
2150 # single chunk
2151 one_chunk_data = [v1] * 20 + [b'', None, v2]
2152 arr = pa.array(one_chunk_data, type=pa.binary())
2153 assert isinstance(arr, pa.Array)
2154 assert len(arr) == 23
2155 assert arr[20].as_py() == b''
2156 assert arr[21].as_py() is None
2157 assert arr[22].as_py() == v2
2158
2159 # two chunks
2160 two_chunk_data = one_chunk_data + [b'two']
2161 arr = pa.array(two_chunk_data, type=pa.binary())
2162 assert isinstance(arr, pa.ChunkedArray)
2163 assert arr.num_chunks == 2
2164 assert len(arr.chunk(0)) == 23
2165 assert len(arr.chunk(1)) == 1
2166 assert arr.chunk(0)[20].as_py() == b''
2167 assert arr.chunk(0)[21].as_py() is None
2168 assert arr.chunk(0)[22].as_py() == v2
2169 assert arr.chunk(1).to_pylist() == [b'two']
2170
2171 # three chunks
2172 three_chunk_data = one_chunk_data * 2 + [b'three', b'three']
2173 arr = pa.array(three_chunk_data, type=pa.binary())
2174 assert isinstance(arr, pa.ChunkedArray)
2175 assert arr.num_chunks == 3
2176 assert len(arr.chunk(0)) == 23
2177 assert len(arr.chunk(1)) == 23
2178 assert len(arr.chunk(2)) == 2
2179 for i in range(2):
2180 assert arr.chunk(i)[20].as_py() == b''
2181 assert arr.chunk(i)[21].as_py() is None
2182 assert arr.chunk(i)[22].as_py() == v2
2183 assert arr.chunk(2).to_pylist() == [b'three', b'three']
2184
2185
2186 @pytest.mark.large_memory
2187 def test_auto_chunking_list_of_binary():
2188 # ARROW-6281
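# (2 << 20) + 1 single-element lists of 1024-byte strings add up to slightly
# more character data than a single binary child array can hold, so the
# conversion is expected to fall back to a ChunkedArray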
2189 vals = [['x' * 1024]] * ((2 << 20) + 1)
2190 arr = pa.array(vals)
2191 assert isinstance(arr, pa.ChunkedArray)
2192 assert arr.num_chunks == 2
2193 assert len(arr.chunk(0)) == 2**21 - 1
2194 assert len(arr.chunk(1)) == 2
2195 assert arr.chunk(1).to_pylist() == [['x' * 1024]] * 2
2196
2197
2198 @pytest.mark.large_memory
2199 def test_auto_chunking_list_like():
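# each item holds 2**28 uint8 values: seven items still fit in one ListArray,
# while eight items reach 2**31 child elements and should force chunking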
2200 item = np.ones((2**28,), dtype='uint8')
2201 data = [item] * (2**3 - 1)
2202 arr = pa.array(data, type=pa.list_(pa.uint8()))
2203 assert isinstance(arr, pa.Array)
2204 assert len(arr) == 7
2205
2206 item = np.ones((2**28,), dtype='uint8')
2207 data = [item] * 2**3
2208 arr = pa.array(data, type=pa.list_(pa.uint8()))
2209 assert isinstance(arr, pa.ChunkedArray)
2210 assert arr.num_chunks == 2
2211 assert len(arr.chunk(0)) == 7
2212 assert len(arr.chunk(1)) == 1
2213 chunk = arr.chunk(1)
2214 scalar = chunk[0]
2215 assert isinstance(scalar, pa.ListScalar)
2216 expected = pa.array(item, type=pa.uint8())
2217 assert scalar.values == expected
2218
2219
2220 @pytest.mark.slow
2221 @pytest.mark.large_memory
2222 def test_auto_chunking_map_type():
2223 # takes ~20 minutes locally
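# eight lists of 2**28 (key, value) pairs give 2**31 child elements in total,
# more than a single MapArray can index with 32-bit offsets, hence the 7 + 1 split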
2224 ty = pa.map_(pa.int8(), pa.int8())
2225 item = [(1, 1)] * 2**28
2226 data = [item] * 2**3
2227 arr = pa.array(data, type=ty)
2228 assert isinstance(arr, pa.ChunkedArray)
2229 assert len(arr.chunk(0)) == 7
2230 assert len(arr.chunk(1)) == 1
2231
2232
2233 @pytest.mark.large_memory
2234 @pytest.mark.parametrize(('ty', 'char'), [
2235 (pa.string(), 'x'),
2236 (pa.binary(), b'x'),
2237 ])
2238 def test_nested_auto_chunking(ty, char):
2239 v1 = char * 100000000
2240 v2 = char * 147483646
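# same sizes as in test_auto_chunking_binary_like: 20 * len(v1) + len(v2) sits
# just below the point where the string-like child must be split, so appending
# one more row is expected to trigger chunking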
2241
2242 struct_type = pa.struct([
2243 pa.field('bool', pa.bool_()),
2244 pa.field('integer', pa.int64()),
2245 pa.field('string-like', ty),
2246 ])
2247
2248 data = [{'bool': True, 'integer': 1, 'string-like': v1}] * 20
2249 data.append({'bool': True, 'integer': 1, 'string-like': v2})
2250 arr = pa.array(data, type=struct_type)
2251 assert isinstance(arr, pa.Array)
2252
2253 data.append({'bool': True, 'integer': 1, 'string-like': char})
2254 arr = pa.array(data, type=struct_type)
2255 assert isinstance(arr, pa.ChunkedArray)
2256 assert arr.num_chunks == 2
2257 assert len(arr.chunk(0)) == 21
2258 assert len(arr.chunk(1)) == 1
2259 assert arr.chunk(1)[0].as_py() == {
2260 'bool': True,
2261 'integer': 1,
2262 'string-like': char
2263 }
2264
2265
2266 @pytest.mark.large_memory
2267 def test_array_from_pylist_data_overflow():
2268 # Regression test for ARROW-12983
2269 # Data buffer overflow - should result in chunked array
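# 2**19 values of 4096 bytes each amount to 2**31 bytes of character data,
# more than a single string/binary array can address with 32-bit offsets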
2270 items = [b'a' * 4096] * (2 ** 19)
2271 arr = pa.array(items, type=pa.string())
2272 assert isinstance(arr, pa.ChunkedArray)
2273 assert len(arr) == 2**19
2274 assert len(arr.chunks) > 1
2275
2276 mask = np.zeros(2**19, bool)
2277 arr = pa.array(items, mask=mask, type=pa.string())
2278 assert isinstance(arr, pa.ChunkedArray)
2279 assert len(arr) == 2**19
2280 assert len(arr.chunks) > 1
2281
2282 arr = pa.array(items, type=pa.binary())
2283 assert isinstance(arr, pa.ChunkedArray)
2284 assert len(arr) == 2**19
2285 assert len(arr.chunks) > 1
2286
2287
2288 @pytest.mark.slow
2289 @pytest.mark.large_memory
2290 def test_array_from_pylist_offset_overflow():
2291 # Regression test for ARROW-12983
2292 # Offset buffer overflow - should result in chunked array
2293 # Note this doesn't apply to primitive arrays
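# 2**31 one-byte values overflow the 32-bit offset buffer by element count
# alone; primitive arrays have no offset buffer, so they are unaffected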
2294 items = [b'a'] * (2 ** 31)
2295 arr = pa.array(items, type=pa.string())
2296 assert isinstance(arr, pa.ChunkedArray)
2297 assert len(arr) == 2**31
2298 assert len(arr.chunks) > 1
2299
2300 mask = np.zeros(2**31, bool)
2301 arr = pa.array(items, mask=mask, type=pa.string())
2302 assert isinstance(arr, pa.ChunkedArray)
2303 assert len(arr) == 2**31
2304 assert len(arr.chunks) > 1
2305
2306 arr = pa.array(items, type=pa.binary())
2307 assert isinstance(arr, pa.ChunkedArray)
2308 assert len(arr) == 2**31
2309 assert len(arr.chunks) > 1