]>
Commit | Line | Data |
---|---|---|
1 | # Licensed to the Apache Software Foundation (ASF) under one | |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | import collections | |
19 | import datetime | |
20 | import decimal | |
21 | import itertools | |
22 | import math | |
23 | import re | |
24 | ||
25 | import hypothesis as h | |
26 | import numpy as np | |
27 | import pytz | |
28 | import pytest | |
29 | ||
30 | from pyarrow.pandas_compat import _pandas_api # noqa | |
31 | import pyarrow as pa | |
32 | import pyarrow.tests.strategies as past | |
33 | ||
34 | ||
# (numpy scalar type, matching Arrow type) pairs covering every
# fixed-width integer flavor; used to parametrize the integer tests.
int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64()),
]


# Parallel tuples of just the numpy side and just the Arrow side.
np_int_types, pa_int_types = zip(*int_type_pairs)
47 | ||
48 | ||
class StrangeIterable:
    """A minimal iterable: supports __iter__ only, not the sequence protocol."""

    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return iter(self.lst)
55 | ||
56 | ||
class MyInt:
    """An object convertible to an integer through __int__."""

    def __init__(self, value):
        self.value = value

    def __int__(self):
        return self.value
63 | ||
64 | ||
class MyBrokenInt:
    """An object whose __int__ raises, to exercise conversion error paths."""

    def __int__(self):
        1/0  # MARKER
68 | ||
69 | ||
def check_struct_type(ty, expected):
    """
    Check a struct type is as expected, but not taking order into account.
    """
    assert pa.types.is_struct(ty)
    assert set(ty) == set(expected)
76 | ||
77 | ||
def test_iterable_types():
    """A plain iterable converts the same as a tuple of the same values."""
    from_iterable = pa.array(StrangeIterable([0, 1, 2, 3]))
    from_tuple = pa.array((0, 1, 2, 3))

    assert from_iterable.equals(from_tuple)
83 | ||
84 | ||
def test_empty_iterable():
    """An empty iterable yields an empty, null-typed array."""
    arr = pa.array(StrangeIterable([]))
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
91 | ||
92 | ||
def test_limited_iterator_types():
    """An iterator with an exact `size` hint converts like a sequence."""
    from_iter = pa.array(iter(range(3)), type=pa.int64(), size=3)
    expected = pa.array((0, 1, 2))
    assert from_iter.equals(expected)
97 | ||
98 | ||
def test_limited_iterator_size_overflow():
    """A `size` hint smaller than the iterator truncates the result."""
    from_iter = pa.array(iter(range(3)), type=pa.int64(), size=2)
    expected = pa.array((0, 1))
    assert from_iter.equals(expected)
103 | ||
104 | ||
def test_limited_iterator_size_underflow():
    """A `size` hint larger than the iterator simply stops at exhaustion."""
    from_iter = pa.array(iter(range(3)), type=pa.int64(), size=10)
    expected = pa.array((0, 1, 2))
    assert from_iter.equals(expected)
109 | ||
110 | ||
def test_iterator_without_size():
    """Without a size hint the iterator is consumed in full."""
    expected = pa.array((0, 1, 2))
    result = pa.array(iter(range(3)))
    assert result.equals(expected)
    # Same with explicit type
    result = pa.array(iter(range(3)), type=pa.int64())
    assert result.equals(expected)
118 | ||
119 | ||
def test_infinite_iterator():
    """An unbounded iterator is cut off at the requested size."""
    expected = pa.array((0, 1, 2))
    result = pa.array(itertools.count(0), size=3)
    assert result.equals(expected)
    # Same with explicit type
    result = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert result.equals(expected)
127 | ||
128 | ||
129 | def _as_list(xs): | |
130 | return xs | |
131 | ||
132 | ||
133 | def _as_tuple(xs): | |
134 | return tuple(xs) | |
135 | ||
136 | ||
137 | def _as_deque(xs): | |
138 | # deque is a sequence while neither tuple nor list | |
139 | return collections.deque(xs) | |
140 | ||
141 | ||
142 | def _as_dict_values(xs): | |
143 | # a dict values object is not a sequence, just a regular iterable | |
144 | dct = {k: v for k, v in enumerate(xs)} | |
145 | return dct.values() | |
146 | ||
147 | ||
148 | def _as_numpy_array(xs): | |
149 | arr = np.empty(len(xs), dtype=object) | |
150 | arr[:] = xs | |
151 | return arr | |
152 | ||
153 | ||
154 | def _as_set(xs): | |
155 | return set(xs) | |
156 | ||
157 | ||
# Container conversion helpers grouped by the protocol they support.
SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES

# Decorators that run a test once per container flavor via the `seq` param.
parametrize_with_iterable_types = pytest.mark.parametrize(
    "seq", ITERABLE_TYPES
)

parametrize_with_sequence_types = pytest.mark.parametrize(
    "seq", SEQUENCE_TYPES
)

parametrize_with_collections_types = pytest.mark.parametrize(
    "seq", COLLECTIONS_TYPES
)
173 | ||
174 | ||
@parametrize_with_collections_types
def test_sequence_types(seq):
    """Every supported container converts identically to a plain list."""
    converted = pa.array(seq([1, 2, 3]))
    baseline = pa.array([1, 2, 3])

    assert converted.equals(baseline)
181 | ||
182 | ||
@parametrize_with_iterable_types
def test_nested_sequence_types(seq):
    """A container nested inside a list converts like a nested list."""
    converted = pa.array([seq([1, 2, 3])])
    baseline = pa.array([[1, 2, 3]])

    assert converted.equals(baseline)
189 | ||
190 | ||
@parametrize_with_sequence_types
def test_sequence_boolean(seq):
    """Booleans with nulls infer the bool type and round-trip."""
    expected = [True, None, False, None]
    arr = pa.array(seq(expected))
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == expected
199 | ||
200 | ||
@parametrize_with_sequence_types
def test_sequence_numpy_boolean(seq):
    """numpy bool scalars convert to an Arrow boolean array."""
    values = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(values))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, None]
207 | ||
208 | ||
@parametrize_with_sequence_types
def test_sequence_mixed_numpy_python_bools(seq):
    """numpy and Python bools may be mixed in one input."""
    np_bools = np.array([True, False])
    arr = pa.array(seq([np_bools[0], None, np_bools[1], True, False]))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, True, False]
215 | ||
216 | ||
@parametrize_with_collections_types
def test_empty_list(seq):
    """Empty containers convert to an empty, null-typed array."""
    arr = pa.array(seq([]))
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
224 | ||
225 | ||
@parametrize_with_sequence_types
def test_nested_lists(seq):
    """Nested lists infer list<int64>; an explicit type is honored too."""
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data
    # With explicit type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
240 | ||
241 | ||
@parametrize_with_sequence_types
def test_nested_large_lists(seq):
    """Nested lists convert under an explicit large_list type."""
    data = [[], [1, 2], None]
    arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.large_list(pa.int16())
    assert arr.to_pylist() == data
250 | ||
251 | ||
@parametrize_with_collections_types
def test_list_with_non_list(seq):
    # List types don't accept non-sequences
    bad = [[], [1, 2], 3]
    with pytest.raises(TypeError):
        pa.array(seq(bad), type=pa.list_(pa.int64()))
    with pytest.raises(TypeError):
        pa.array(seq(bad), type=pa.large_list(pa.int64()))
259 | ||
260 | ||
@parametrize_with_sequence_types
def test_nested_arrays(seq):
    """numpy int64 arrays nested in a container become list<int64>."""
    arr = pa.array(seq([np.array([], dtype=np.int64),
                        np.array([1, 2], dtype=np.int64), None]))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
269 | ||
270 | ||
@parametrize_with_sequence_types
def test_nested_fixed_size_list(seq):
    """Fixed-size-list conversion from lists and numpy arrays.

    Covers happy paths, wrong-length errors, and the size-0 edge case.
    """
    # sequence of lists
    data = [[1, 2], [3, None], None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 2)
    assert arr.to_pylist() == data

    # sequence of numpy arrays
    data = [np.array([1, 2], dtype='int64'), np.array([3, 4], dtype='int64'),
            None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 2))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 2)
    assert arr.to_pylist() == [[1, 2], [3, 4], None]

    # incorrect length of the lists or arrays
    # (NOTE: a dead `data = [[1, 2, 4], [3, None], None]` assignment that
    # was immediately shadowed by this loop has been removed)
    for data in [[[1, 2, 3]], [np.array([1, 2, 4], dtype='int64')]]:
        with pytest.raises(
                ValueError, match="Length of item not correct: expected 2"):
            pa.array(seq(data), type=pa.list_(pa.int64(), 2))

    # with list size of 0
    data = [[], [], None]
    arr = pa.array(seq(data), type=pa.list_(pa.int64(), 0))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64(), 0)
    assert arr.to_pylist() == [[], [], None]
304 | ||
305 | ||
@parametrize_with_sequence_types
def test_sequence_all_none(seq):
    """An all-None input infers the null type."""
    arr = pa.array(seq([None, None]))
    assert len(arr) == 2
    assert arr.null_count == 2
    assert arr.type == pa.null()
    assert arr.to_pylist() == [None, None]
313 | ||
314 | ||
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer(seq, np_scalar_pa_type):
    """Python ints, including each type's min/max, convert under an explicit type."""
    np_scalar, pa_type = np_scalar_pa_type
    info = np.iinfo(np_scalar)
    expected = [1, None, 3, None, info.min, info.max]
    arr = pa.array(seq(expected), type=pa_type)
    assert len(arr) == 6
    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
326 | ||
327 | ||
@parametrize_with_collections_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # a double array.
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([np.nan]), type=pa_type, from_pandas=False)

    # Under from_pandas, NaN becomes a null of the requested integer type.
    arr = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
    assert len(arr) == 1
    assert arr.null_count == 1
    assert arr.type == pa_type
    assert arr.to_pylist() == [None]
343 | ||
344 | ||
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # a double array.
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

    # Under from_pandas, the nested NaN becomes a null list element.
    arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
    assert len(arr) == 1
    assert arr.null_count == 0
    assert arr.type == pa.list_(pa_type)
    assert arr.to_pylist() == [[None]]
360 | ||
361 | ||
@parametrize_with_sequence_types
def test_sequence_integer_inferred(seq):
    """Plain ints with nulls infer int64."""
    expected = [1, None, 3, None]
    arr = pa.array(seq(expected))
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pa.int64()
    assert arr.to_pylist() == expected
370 | ||
371 | ||
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer(seq, np_scalar_pa_type):
    """numpy int scalars convert under the matching explicit type."""
    np_scalar, pa_type = np_scalar_pa_type
    info = np.iinfo(np_scalar)
    expected = [np_scalar(1), None, np_scalar(3), None,
                np_scalar(info.min), np_scalar(info.max)]
    arr = pa.array(seq(expected), type=pa_type)
    assert len(arr) == 6
    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
384 | ||
385 | ||
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
    """numpy int scalars infer the matching Arrow integer type."""
    np_scalar, pa_type = np_scalar_pa_type
    info = np.iinfo(np_scalar)
    expected = [np_scalar(1), None, np_scalar(3), None,
                np_scalar(info.min), np_scalar(info.max)]
    arr = pa.array(seq(expected))
    assert len(arr) == 6
    assert arr.null_count == 2
    assert arr.type == pa_type
    assert arr.to_pylist() == expected
398 | ||
399 | ||
@parametrize_with_sequence_types
def test_sequence_custom_integers(seq):
    """Objects implementing __int__ convert to int64."""
    expected = [0, 42, 2**33 + 1, -2**63]
    wrapped = [MyInt(v) for v in expected]
    arr = pa.array(seq(wrapped), type=pa.int64())
    assert arr.to_pylist() == expected
406 | ||
407 | ||
@parametrize_with_collections_types
def test_broken_integers(seq):
    """A raising __int__ surfaces as ArrowInvalid."""
    data = [MyBrokenInt()]
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        pa.array(seq(data), type=pa.int64())
413 | ||
414 | ||
def test_numpy_scalars_mixed_type():
    """Mixed numpy int/float scalars promote to a common float type."""
    # ARROW-4324
    arr = pa.array([np.int32(10), np.float32(0.5)])
    expected = pa.array([10, 0.5], type="float64")
    assert arr.equals(expected)

    # ARROW-9490
    arr = pa.array([np.int8(10), np.float32(0.5)])
    expected = pa.array([10, 0.5], type="float32")
    assert arr.equals(expected)
427 | ||
428 | ||
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
                   raises=OverflowError)
def test_uint64_max_convert():
    """uint64 max converts with an explicit type; inference is expected to fail."""
    data = [0, np.iinfo(np.uint64).max]

    expected = pa.array(np.array(data, dtype='uint64'))
    arr = pa.array(data, type=pa.uint64())
    assert arr.equals(expected)

    arr_inferred = pa.array(data)
    assert arr_inferred.equals(expected)
440 | ||
441 | ||
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_signed_integer_overflow(bits):
    """Values just outside the signed range are rejected."""
    ty = getattr(pa, "int%d" % bits)()
    # XXX ideally would always raise OverflowError
    too_big = 2 ** (bits - 1)
    too_small = -too_big - 1
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([too_big], ty)
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([too_small], ty)
450 | ||
451 | ||
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_unsigned_integer_overflow(bits):
    """Values outside the unsigned range are rejected."""
    ty = getattr(pa, "uint%d" % bits)()
    # XXX ideally would always raise OverflowError
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([2 ** bits], ty)
    with pytest.raises((OverflowError, pa.ArrowInvalid)):
        pa.array([-1], ty)
460 | ||
461 | ||
@parametrize_with_collections_types
@pytest.mark.parametrize("typ", pa_int_types)
def test_integer_from_string_error(seq, typ):
    # ARROW-9451: pa.array(['1'], type=pa.uint32()) should not succeed
    with pytest.raises(pa.ArrowInvalid):
        pa.array(seq(['1']), type=typ)
468 | ||
469 | ||
def test_convert_with_mask():
    """A boolean mask nulls out the flagged positions; wrong length raises."""
    data = [1, 2, 3, 4, 5]
    mask = np.array([False, True, False, False, True])

    result = pa.array(data, mask=mask)
    expected = pa.array([1, None, 3, 4, None])

    assert result.equals(expected)

    # Mask wrong length
    with pytest.raises(ValueError):
        pa.array(data, mask=mask[1:])
482 | ||
483 | ||
def test_garbage_collection():
    """Array memory is released once the array is unreferenced."""
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()

    bytes_before = pa.total_allocated_bytes()
    pa.array([1, None, 3, None])
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before
494 | ||
495 | ||
def test_sequence_double():
    """Floats with nulls infer float64 and round-trip."""
    data = [1.5, 1., None, 2.5, None, None]
    arr = pa.array(data)
    assert len(arr) == 6
    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == data
503 | ||
504 | ||
def test_double_auto_coerce_from_integer():
    # Done as part of ARROW-2814: ints mixed with floats coerce to double,
    # regardless of which comes first.
    arr = pa.array([1.5, 1., None, 2.5, None, None])
    arr2 = pa.array([1.5, 1, None, 2.5, None, None])
    assert arr.equals(arr2)

    arr3 = pa.array([1, 1.5, None, 2.5, None, None])
    arr4 = pa.array([1., 1.5, None, 2.5, None, None])
    assert arr3.equals(arr4)
522 | ||
523 | ||
def test_double_integer_coerce_representable_range():
    """Ints beyond +/- 2**53 are not exactly representable as doubles."""
    valid_values = [1.5, 1, 2, None, 1 << 53, -(1 << 53)]
    invalid_values = [1.5, 1, 2, None, (1 << 53) + 1]
    invalid_values2 = [1.5, 1, 2, None, -((1 << 53) + 1)]

    # it works
    pa.array(valid_values)

    # it fails
    with pytest.raises(ValueError):
        pa.array(invalid_values)

    with pytest.raises(ValueError):
        pa.array(invalid_values2)
538 | ||
539 | ||
def test_float32_integer_coerce_representable_range():
    """Ints beyond +/- 2**24 are not exactly representable as float32."""
    f32 = np.float32
    valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
    invalid_values = [f32(1.5), (1 << 24) + 1]
    invalid_values2 = [f32(1.5), -((1 << 24) + 1)]

    # it works
    pa.array(valid_values, type=pa.float32())

    # it fails
    with pytest.raises(ValueError):
        pa.array(invalid_values, type=pa.float32())

    with pytest.raises(ValueError):
        pa.array(invalid_values2, type=pa.float32())
555 | ||
556 | ||
def test_mixed_sequence_errors():
    """Strings mixed into bool/float input raise with a clear message."""
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])
566 | ||
567 | ||
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar,pa_type", [
    (np.float16, pa.float16()),
    (np.float32, pa.float32()),
    (np.float64, pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
    """NaN counts as null only under from_pandas; it also affects inference."""
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    assert len(arr) == 6
    if from_pandas:
        assert arr.null_count == 3
        # The NaN is skipped in type inference, otherwise it forces a
        # float64 promotion
        assert arr.type == pa_type
    else:
        assert arr.null_count == 2
        assert arr.type == pa.float64()

    assert arr.to_pylist()[:4] == data[:4]
    if from_pandas:
        assert arr.to_pylist()[5] is None
    else:
        assert np.isnan(arr.to_pylist()[5])
595 | ||
596 | ||
@pytest.mark.parametrize("from_pandas", [True, False])
@pytest.mark.parametrize("inner_seq", [np.array, list])
def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
    # ARROW-2806: nested doubles with a NaN element; the NaN becomes a
    # null only under from_pandas.
    data = np.array([
        inner_seq([1., 2.]),
        inner_seq([1., 2., 3.]),
        inner_seq([np.nan]),
        None
    ], dtype=object)
    arr = pa.array(data, from_pandas=from_pandas)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.float64())
    if from_pandas:
        assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
    else:
        np.testing.assert_equal(arr.to_pylist(),
                                [[1., 2.], [1., 2., 3.], [np.nan], None])
616 | ||
617 | ||
def test_nested_ndarray_in_object_array():
    # ARROW-4350: ndarrays inside an object ndarray become nested lists.
    arr = np.empty(2, dtype=object)
    arr[:] = [np.array([1, 2], dtype=np.int64),
              np.array([2, 3], dtype=np.int64)]

    arr2 = np.empty(2, dtype=object)
    arr2[0] = [3, 4]
    arr2[1] = [5, 6]

    expected_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([arr]) == expected_type

    result = pa.array([arr, arr2])
    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                        type=expected_type)

    assert result.equals(expected)

    # test case for len-1 arrays to ensure they are interpreted as
    # sublists and not scalars
    arr = np.empty(2, dtype=object)
    arr[:] = [np.array([1]), np.array([2])]
    result = pa.array([arr, arr])
    assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
643 | ||
644 | ||
@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
                           "not yet implemented"),
                   raises=AssertionError)
def test_multidimensional_ndarray_as_nested_list():
    # TODO(wesm): see ARROW-5645
    matrix_a = np.array([[1, 2], [2, 3]], dtype=np.int64)
    matrix_b = np.array([[3, 4], [5, 6]], dtype=np.int64)

    expected_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([matrix_a]) == expected_type

    result = pa.array([matrix_a, matrix_b])
    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                        type=expected_type)

    assert result.equals(expected)
661 | ||
662 | ||
@pytest.mark.parametrize(('data', 'value_type'), [
    ([True, False], pa.bool_()),
    ([None, None], pa.null()),
    ([1, 2, None], pa.int8()),
    ([1, 2., 3., None], pa.float32()),
    ([datetime.date.today(), None], pa.date32()),
    ([None, datetime.date.today()], pa.date64()),
    ([datetime.time(1, 1, 1), None], pa.time32('s')),
    ([None, datetime.time(2, 2, 2)], pa.time64('us')),
    ([datetime.datetime.now(), None], pa.timestamp('us')),
    ([datetime.timedelta(seconds=10)], pa.duration('s')),
    ([b"a", b"b"], pa.binary()),
    ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
    ([b"a", b"b", b"c"], pa.large_binary()),
    (["a", "b", "c"], pa.string()),
    (["a", "b", "c"], pa.large_string()),
    (
        [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
        pa.struct([('a', pa.int8()), ('b', pa.int16())])
    )
])
def test_list_array_from_object_ndarray(data, value_type):
    """An object ndarray wrapped in a list converts as one list element."""
    ty = pa.list_(value_type)
    ndarray = np.array(data, dtype=object)
    arr = pa.array([ndarray], type=ty)
    assert arr.type.equals(ty)
    assert arr.to_pylist() == [data]
690 | ||
691 | ||
@pytest.mark.parametrize(('data', 'value_type'), [
    ([[1, 2], [3]], pa.list_(pa.int64())),
    ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
    ([[1], [2, 3]], pa.large_list(pa.int64()))
])
def test_nested_list_array_from_object_ndarray(data, value_type):
    """Nested object ndarrays convert under a nested list type."""
    ndarray = np.empty(len(data), dtype=object)
    ndarray[:] = [np.array(item, dtype=object) for item in data]

    ty = pa.list_(value_type)
    arr = pa.array([ndarray], type=ty)
    assert arr.type.equals(ty)
    assert arr.to_pylist() == [data]
705 | ||
706 | ||
def test_array_ignore_nan_from_pandas():
    # See ARROW-4324, this reverts logic that was introduced in
    # ARROW-2240
    with pytest.raises(ValueError):
        pa.array([np.nan, 'str'])

    arr = pa.array([np.nan, 'str'], from_pandas=True)
    expected = pa.array([None, 'str'])
    assert arr.equals(expected)
716 | ||
717 | ||
def test_nested_ndarray_different_dtypes():
    """Nested ndarrays with differing int dtypes unify on conversion."""
    data = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    arr = pa.array(data)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert arr.equals(expected)

    t2 = pa.list_(pa.uint32())
    arr2 = pa.array(data, type=t2)
    expected2 = expected.cast(t2)
    assert arr2.equals(expected2)
734 | ||
735 | ||
def test_sequence_unicode():
    """Unicode strings with a null infer the string type and round-trip."""
    data = ['foo', 'bar', None, 'mañana']
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.string()
    assert arr.to_pylist() == data
743 | ||
744 | ||
def check_array_mixed_unicode_bytes(binary_type, string_type):
    """Mixed str/bytes input: inferred binary, explicit binary, explicit string."""
    values = ['qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = ['qux', 'foo', 'barz']

    # Inference picks binary for mixed input.
    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    # Explicit binary type: everything is encoded.
    arr = pa.array(values, type=binary_type)
    expected = pa.array(b_values, type=binary_type)
    assert arr.type == binary_type
    assert arr.equals(expected)

    # Explicit string type: everything is decoded.
    arr = pa.array(values, type=string_type)
    expected = pa.array(u_values, type=string_type)
    assert arr.type == string_type
    assert arr.equals(expected)
764 | ||
765 | ||
def test_array_mixed_unicode_bytes():
    """Run the mixed str/bytes checks for both regular and large types."""
    check_array_mixed_unicode_bytes(pa.binary(), pa.string())
    check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
769 | ||
770 | ||
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
def test_large_binary_array(ty):
    # Construct a large binary array with more than 4GB of data
    s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
    nrepeats = math.ceil((2**32 + 5) / len(s))
    arr = pa.array([s] * nrepeats, type=ty)
    assert isinstance(arr, pa.Array)
    assert arr.type == ty
    assert len(arr) == nrepeats
782 | ||
783 | ||
@pytest.mark.slow
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
def test_large_binary_value(ty):
    # Construct a large binary array with a single value larger than 4GB
    s = b"0123456789abcdefghijklmnopqrstuvwxyz"
    nrepeats = math.ceil((2**32 + 5) / len(s))
    arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
    assert isinstance(arr, pa.Array)
    assert arr.type == ty
    assert len(arr) == 4
    buf = arr[1].as_buffer()
    assert len(buf) == len(s) * nrepeats
797 | ||
798 | ||
@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
def test_string_too_large(ty):
    # Construct a binary array with a single value larger than 4GB;
    # non-large types must refuse it.
    s = b"0123456789abcdefghijklmnopqrstuvwxyz"
    nrepeats = math.ceil((2**32 + 5) / len(s))
    with pytest.raises(pa.ArrowCapacityError):
        pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
807 | ||
808 | ||
def test_sequence_bytes():
    """Various bytes-like inputs all convert to binary arrays."""
    u1 = b'ma\xc3\xb1ana'

    data = [b'foo',
            memoryview(b'dada'),
            memoryview(b'd-a-t-a')[::2],  # non-contiguous is made contiguous
            u1.decode('utf-8'),  # unicode gets encoded,
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary(), pa.large_binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 6
        assert arr.null_count == 1
        # BUG FIX: this used to be `arr.type == ty or pa.binary()`, which
        # parses as `(arr.type == ty) or pa.binary()` and is always truthy
        # (pa.binary() is a truthy object), so the assert could never fail.
        # Compare against the expected type: binary() when inferred (ty=None).
        assert arr.type == (ty or pa.binary())
        assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]
824 | ||
825 | ||
@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
def test_sequence_utf8_to_unicode(ty):
    # ARROW-1225: utf-8 bytes decode to str under a string type.
    data = [b'foo', None, b'bar']
    arr = pa.array(data, type=ty)
    assert arr.type == ty
    assert arr[0].as_py() == 'foo'

    # test a non-utf8 unicode string
    val = ('mañana').encode('utf-16-le')
    with pytest.raises(pa.ArrowInvalid):
        pa.array([val], type=ty)
838 | ||
839 | ||
def test_sequence_fixed_size_bytes():
    """4-byte values (bytes and bytearray, plus a null) convert under binary(4)."""
    data = [b'foof', None, bytearray(b'barb'), b'2346']
    arr = pa.array(data, type=pa.binary(4))
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.binary(4)
    assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
847 | ||
848 | ||
def test_fixed_size_bytes_does_not_accept_varying_lengths():
    """A wrong-length value is rejected by a fixed-size binary type."""
    with pytest.raises(pa.ArrowInvalid):
        pa.array([b'foo', None, b'barb', b'2346'], type=pa.binary(4))
853 | ||
854 | ||
def test_fixed_size_binary_length_check():
    # ARROW-10193: a value of exactly the declared width round-trips.
    data = [b'\x19h\r\x9e\x00\x00\x00\x00\x01\x9b\x9fA']
    assert len(data[0]) == 12
    ty = pa.binary(12)
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == data
862 | ||
863 | ||
def test_sequence_date():
    """date objects with a null infer date32 and round-trip per element."""
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
875 | ||
876 | ||
@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    """Building from date objects or raw integers yields the same array."""
    ty, raw_values = input
    from_dates = pa.array([datetime.date(2000, 1, 1), None], type=ty)
    from_ints = pa.array(raw_values, type=ty)

    for candidate in (from_dates, from_ints):
        assert len(candidate) == 2
        assert candidate.type == ty
        assert candidate.null_count == 1
        assert candidate[0].as_py() == datetime.date(2000, 1, 1)
        assert candidate[1].as_py() is None
892 | ||
893 | ||
def test_date32_overflow():
    """Day counts beyond the 32-bit range must raise."""
    overflowing = [2**32, None]
    with pytest.raises((OverflowError, pa.ArrowException)):
        pa.array(overflowing, type=pa.date32())
899 | ||
900 | ||
@pytest.mark.parametrize(('time_type', 'unit', 'int_type'), [
    (pa.time32, 's', 'int32'),
    (pa.time32, 'ms', 'int32'),
    (pa.time64, 'us', 'int64'),
    (pa.time64, 'ns', 'int64'),
])
def test_sequence_time_with_timezone(time_type, unit, int_type):
    """Check time32/time64 conversion of timezone-naive ``datetime.time``.

    Verifies both the underlying integer storage (via a cast to the
    matching int type) and the scalar round-trip through ``as_py()``.
    """
    def expected_integer_value(t):
        # only use with utc time object because it doesn't adjust with the
        # offset
        units = ['s', 'ms', 'us', 'ns']
        # scale seconds by 10**0/3/6/9 for unit s/ms/us/ns respectively
        multiplier = 10**(units.index(unit) * 3)
        if t is None:
            return None
        seconds = (
            t.hour * 3600 +
            t.minute * 60 +
            t.second +
            t.microsecond * 10**-6
        )
        return int(seconds * multiplier)

    def expected_time_value(t):
        # only use with utc time object because it doesn't adjust with the
        # time objects tzdata
        # coarser units truncate sub-unit precision
        if unit == 's':
            return t.replace(microsecond=0)
        elif unit == 'ms':
            return t.replace(microsecond=(t.microsecond // 1000) * 1000)
        else:
            return t

    # only timezone naive times are supported in arrow
    data = [
        datetime.time(8, 23, 34, 123456),
        datetime.time(5, 0, 0, 1000),
        None,
        datetime.time(1, 11, 56, 432539),
        datetime.time(23, 10, 0, 437699)
    ]

    ty = time_type(unit)
    arr = pa.array(data, type=ty)
    assert len(arr) == 5
    assert arr.type == ty
    assert arr.null_count == 1

    # test that the underlying integers are UTC values
    values = arr.cast(int_type)
    expected = list(map(expected_integer_value, data))
    assert values.to_pylist() == expected

    # test that the scalars are datetime.time objects with UTC timezone
    assert arr[0].as_py() == expected_time_value(data[0])
    assert arr[1].as_py() == expected_time_value(data[1])
    assert arr[2].as_py() is None
    assert arr[3].as_py() == expected_time_value(data[3])
    assert arr[4].as_py() == expected_time_value(data[4])
959 | ||
def tz(hours, minutes=0):
    """Return a fixed-offset ``datetime.timezone`` for the given offset."""
    delta = datetime.timedelta(hours=hours, minutes=minutes)
    return datetime.timezone(delta)
963 | ||
964 | ||
def test_sequence_timestamp():
    """Naive datetimes infer timestamp('us') and round-trip exactly."""
    stamps = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699),
    ]
    result = pa.array(stamps)
    assert len(result) == 4
    assert result.type == pa.timestamp('us')
    assert result.null_count == 1
    for scalar, expected in zip(result, stamps):
        assert scalar.as_py() == expected
983 | ||
984 | ||
@pytest.mark.parametrize('timezone', [
    None,
    'UTC',
    'Etc/GMT-1',
    'Europe/Budapest',
])
@pytest.mark.parametrize('unit', [
    's',
    'ms',
    'us',
    'ns'
])
def test_sequence_timestamp_with_timezone(timezone, unit):
    """Timestamps with mixed tzinfo convert to UTC-based integer storage.

    Checks the underlying int64 representation and the scalar conversion
    back to datetimes in the column's timezone (or naive when tz=None).
    """
    def expected_integer_value(dt):
        units = ['s', 'ms', 'us', 'ns']
        # scale epoch seconds by 10**0/3/6/9 for unit s/ms/us/ns
        multiplier = 10**(units.index(unit) * 3)
        if dt is None:
            return None
        else:
            # avoid float precision issues
            ts = decimal.Decimal(str(dt.timestamp()))
            return int(ts * multiplier)

    def expected_datetime_value(dt):
        if dt is None:
            return None

        # coarser units truncate sub-unit precision
        if unit == 's':
            dt = dt.replace(microsecond=0)
        elif unit == 'ms':
            dt = dt.replace(microsecond=(dt.microsecond // 1000) * 1000)

        # adjust the timezone
        if timezone is None:
            # make datetime timezone unaware
            return dt.replace(tzinfo=None)
        else:
            # convert to the expected timezone
            return dt.astimezone(pytz.timezone(timezone))

    data = [
        datetime.datetime(2007, 7, 13, 8, 23, 34, 123456),  # naive
        pytz.utc.localize(
            datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
        ),
        None,
        pytz.timezone('US/Eastern').localize(
            datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
        ),
        pytz.timezone('Europe/Moscow').localize(
            datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
        ),
    ]
    # the same instants normalized to UTC; the naive value is taken as UTC
    utcdata = [
        pytz.utc.localize(data[0]),
        data[1],
        None,
        data[3].astimezone(pytz.utc),
        data[4].astimezone(pytz.utc),
    ]

    ty = pa.timestamp(unit, tz=timezone)
    arr = pa.array(data, type=ty)
    assert len(arr) == 5
    assert arr.type == ty
    assert arr.null_count == 1

    # test that the underlying integers are UTC values
    values = arr.cast('int64')
    expected = list(map(expected_integer_value, utcdata))
    assert values.to_pylist() == expected

    # test that the scalars are datetimes with the correct timezone
    for i in range(len(arr)):
        assert arr[i].as_py() == expected_datetime_value(utcdata[i])
1060 | ||
1061 | ||
@pytest.mark.parametrize('timezone', [
    None,
    'UTC',
    'Etc/GMT-1',
    'Europe/Budapest',
])
def test_pyarrow_ignore_timezone_environment_variable(monkeypatch, timezone):
    """With PYARROW_IGNORE_TIMEZONE set, input tzinfo is dropped and the
    naive values are re-localized from UTC into the column's timezone."""
    # any non-empty value is interpreted as true
    monkeypatch.setenv("PYARROW_IGNORE_TIMEZONE", "1")
    inputs = [
        datetime.datetime(2007, 7, 13, 8, 23, 34, 123456),  # naive
        pytz.utc.localize(
            datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
        ),
        pytz.timezone('US/Eastern').localize(
            datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
        ),
        pytz.timezone('Europe/Moscow').localize(
            datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
        ),
    ]

    naive = [dt.replace(tzinfo=None) for dt in inputs]
    if timezone is None:
        expected = naive
    else:
        zone = pytz.timezone(timezone)
        expected = [zone.fromutc(dt) for dt in naive]

    result = pa.array(inputs, type=pa.timestamp('us', tz=timezone))
    assert result.to_pylist() == expected
1092 | ||
1093 | ||
def test_sequence_timestamp_with_timezone_inference():
    """The inferred timestamp type follows the tzinfo of the first value."""
    data = [
        datetime.datetime(2007, 7, 13, 8, 23, 34, 123456),  # naive
        pytz.utc.localize(
            datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
        ),
        None,
        pytz.timezone('US/Eastern').localize(
            datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
        ),
        pytz.timezone('Europe/Moscow').localize(
            datetime.datetime(2010, 8, 13, 5, 0, 0, 437699)
        ),
    ]
    expected_types = [
        pa.timestamp('us', tz=None),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz=None),
        pa.timestamp('us', tz='US/Eastern'),
        pa.timestamp('us', tz='Europe/Moscow')
    ]
    for leader, expected_type in zip(data, expected_types):
        # put each value first in turn; its timezone should drive inference
        result = pa.array([leader] + data)
        assert result.type == expected_type
1119 | ||
1120 | ||
@pytest.mark.pandas
def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes():
    """Mixing pd.Timestamp and datetime inputs: the first value's timezone
    drives the inferred type, while storage stays UTC-based."""
    import pandas as pd

    data = [
        pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'),
                     unit='ns'),
        datetime.datetime(2007, 7, 13, 8, 23, 34, 123456),  # naive
        pytz.utc.localize(
            datetime.datetime(2008, 1, 5, 5, 0, 0, 1000)
        ),
        None,
    ]
    # the same instants normalized to UTC; the naive value is taken as UTC
    utcdata = [
        data[0].astimezone(pytz.utc),
        pytz.utc.localize(data[1]),
        data[2].astimezone(pytz.utc),
        None,
    ]

    arr = pa.array(data)
    # inference adopts the timezone of the first value
    assert arr.type == pa.timestamp('us', tz='US/Eastern')

    # the stored integers are UTC microseconds regardless of display tz
    values = arr.cast('int64')
    expected = [int(dt.timestamp() * 10**6) if dt else None for dt in utcdata]
    assert values.to_pylist() == expected
1147 | ||
1148 | ||
def test_sequence_timestamp_out_of_bounds_nanosecond():
    # https://issues.apache.org/jira/browse/ARROW-9768
    """Datetimes outside the ns-resolution range raise; coarser units work."""
    data = [datetime.datetime(2262, 4, 12)]
    with pytest.raises(ValueError, match="out of bounds"):
        pa.array(data, type=pa.timestamp('ns'))

    # microsecond resolution has a much wider range
    assert pa.array(data, type=pa.timestamp('us')).to_pylist() == data

    # the naive wall time is in range, but the UTC-normalized instant is not
    offset = datetime.timezone(datetime.timedelta(hours=-1))
    data = [datetime.datetime(2262, 4, 11, 23, tzinfo=offset)]
    with pytest.raises(ValueError, match="out of bounds"):
        pa.array(data, type=pa.timestamp('ns'))

    result = pa.array(data, type=pa.timestamp('us'))
    assert result.to_pylist()[0] == datetime.datetime(2262, 4, 12)
1168 | ||
1169 | ||
def test_sequence_numpy_timestamp():
    """np.datetime64 values infer timestamp('us') and round-trip."""
    stamps = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699),
    ]
    data = [np.datetime64(dt) if dt is not None else None for dt in stamps]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.timestamp('us')
    assert arr.null_count == 1
    for scalar, expected in zip(arr, stamps):
        assert scalar.as_py() == expected
1188 | ||
1189 | ||
class MyDate(datetime.date):
    """date subclass used to check conversion accepts subclassed inputs."""
    pass
1192 | ||
1193 | ||
class MyDatetime(datetime.datetime):
    """datetime subclass used to check conversion accepts subclassed inputs."""
    pass
1196 | ||
1197 | ||
class MyTimedelta(datetime.timedelta):
    """timedelta subclass used to check conversion accepts subclassed inputs."""
    pass
1200 | ||
1201 | ||
def test_datetime_subclassing():
    """Subclasses of date/datetime/timedelta convert like their base types."""
    # date subclass -> date32
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    # datetime subclass -> timestamp; coarse units truncate precision
    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    # timedelta subclass -> duration; inference defaults to 'us'
    data = [
        MyTimedelta(123, 456, 1002),
    ]

    s = pa.duration('s')
    ms = pa.duration('ms')
    us = pa.duration('us')

    arr_s = pa.array(data)
    assert len(arr_s) == 1
    assert arr_s.type == us
    assert arr_s[0].as_py() == datetime.timedelta(123, 456, 1002)

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.timedelta(123, 456)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.timedelta(123, 456, 1000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.timedelta(123, 456, 1002)
1265 | ||
1266 | ||
@pytest.mark.xfail(not _pandas_api.have_pandas,
                   reason="pandas required for nanosecond conversion")
def test_sequence_timestamp_nanoseconds():
    """datetime (and its subclasses) convert to timestamp('ns')."""
    ns = pa.timestamp('ns')
    for data in ([datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)],
                 [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]):
        result = pa.array(data, type=ns)
        assert len(result) == 1
        assert result.type == ns
        assert result[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                      23, 34, 123456)
1282 | ||
1283 | ||
@pytest.mark.pandas
def test_sequence_timestamp_from_int_with_unit():
    """Integer input with an explicit timestamp unit; checks scalar repr/str."""
    # TODO(wesm): This test might be rewritten to assert the actual behavior
    # when pandas is not installed

    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert repr(arr_s[0]) == (
        "<pyarrow.TimestampScalar: datetime.datetime(1970, 1, 1, 0, 0, 1)>"
    )
    assert str(arr_s[0]) == "1970-01-01 00:00:01"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert repr(arr_ms[0].as_py()) == (
        "datetime.datetime(1970, 1, 1, 0, 0, 0, 1000)"
    )
    assert str(arr_ms[0]) == "1970-01-01 00:00:00.001000"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert repr(arr_us[0].as_py()) == (
        "datetime.datetime(1970, 1, 1, 0, 0, 0, 1)"
    )
    assert str(arr_us[0]) == "1970-01-01 00:00:00.000001"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    # ns-resolution scalars convert via pandas Timestamp (hence the mark)
    assert repr(arr_ns[0].as_py()) == (
        "Timestamp('1970-01-01 00:00:00.000000001')"
    )
    assert str(arr_ns[0]) == "1970-01-01 00:00:00.000000001"

    expected_exc = TypeError

    class CustomClass():
        pass

    # objects of unrelated types are rejected for temporal types
    for ty in [ns, pa.date32(), pa.date64()]:
        with pytest.raises(expected_exc):
            pa.array([1, CustomClass()], type=ty)
1336 | ||
1337 | ||
@pytest.mark.parametrize('np_scalar', [True, False])
def test_sequence_duration(np_scalar):
    """timedelta (or np.timedelta64) values infer duration('us')."""
    first = datetime.timedelta(2, 3601, 1)
    second = datetime.timedelta(1, 100, 1000)
    if np_scalar:
        values = [np.timedelta64(first), None, np.timedelta64(second)]
    else:
        values = [first, None, second]

    result = pa.array(values)
    assert len(result) == 3
    assert result.type == pa.duration('us')
    assert result.null_count == 1
    assert result[0].as_py() == first
    assert result[1].as_py() is None
    assert result[2].as_py() == second
1354 | ||
1355 | ||
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_sequence_duration_with_unit(unit):
    """Coarse duration units truncate sub-unit precision."""
    value = datetime.timedelta(3, 22, 1001)
    truncated = {'s': datetime.timedelta(3, 22),
                 'ms': datetime.timedelta(3, 22, 1000),
                 'us': datetime.timedelta(3, 22, 1001),
                 'ns': datetime.timedelta(3, 22, 1001)}

    ty = pa.duration(unit)
    result = pa.array([value], type=ty)
    assert len(result) == 1
    assert result.type == ty
    assert result[0].as_py() == truncated[unit]
1372 | ||
1373 | ||
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_sequence_duration_from_int_with_unit(unit):
    """Raw integers are stored unchanged for every duration unit."""
    ty = pa.duration(unit)
    result = pa.array([5], type=ty)
    assert len(result) == 1
    assert result.type == ty
    assert result[0].value == 5
1383 | ||
1384 | ||
def test_sequence_duration_nested_lists():
    """Nested lists of timedeltas: inferred and explicit list types."""
    td1 = datetime.timedelta(1, 1, 1000)
    td2 = datetime.timedelta(1, 100)
    data = [[td1, None], [td1, td2]]

    # inference picks list<duration[us]>
    inferred = pa.array(data)
    assert len(inferred) == 2
    assert inferred.type == pa.list_(pa.duration('us'))
    assert inferred.to_pylist() == data

    # an explicit coarser unit also round-trips (values are whole ms)
    explicit = pa.array(data, type=pa.list_(pa.duration('ms')))
    assert len(explicit) == 2
    assert explicit.type == pa.list_(pa.duration('ms'))
    assert explicit.to_pylist() == data
1400 | ||
1401 | ||
def test_sequence_duration_nested_lists_numpy():
    """Nested numpy timedelta values convert to list<duration[us]>."""
    td1 = datetime.timedelta(1, 1, 1000)
    td2 = datetime.timedelta(1, 100)
    expected = [[td1, None], [td1, td2]]

    # nested plain lists holding numpy scalars
    result = pa.array([[np.timedelta64(td1), None],
                       [np.timedelta64(td1), np.timedelta64(td2)]])
    assert len(result) == 2
    assert result.type == pa.list_(pa.duration('us'))
    assert result.to_pylist() == expected

    # nested numpy arrays
    result = pa.array(
        [np.array([np.timedelta64(td1), None], dtype='timedelta64[us]'),
         np.array([np.timedelta64(td1), np.timedelta64(td2)])])
    assert len(result) == 2
    assert result.type == pa.list_(pa.duration('us'))
    assert result.to_pylist() == expected
1421 | ||
1422 | ||
def test_sequence_nesting_levels():
    """Uniform nesting is inferred; mixed nesting depth is rejected."""
    good_cases = [
        ([1, 2, None], pa.int64()),
        ([[1], [2], None], pa.list_(pa.int64())),
        ([[1], [2, 3, 4], [None]], pa.list_(pa.int64())),
        ([None, [[None, 1]], [[2, 3, 4], None], [None]],
         pa.list_(pa.list_(pa.int64()))),
    ]
    for data, expected_type in good_cases:
        result = pa.array(data)
        assert result.type == expected_type
        assert result.to_pylist() == data

    exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)

    # Mixed nesting levels are rejected
    for bad in ([1, 2, [1]],
                [1, 2, []],
                [[1], [2], [None, [1]]]):
        with pytest.raises(exceptions):
            pa.array(bad)
1455 | ||
1456 | ||
def test_sequence_mixed_types_fails():
    """A sequence mixing strings and numbers cannot be inferred."""
    with pytest.raises(pa.ArrowTypeError):
        pa.array(['a', 1, 2.0])
1461 | ||
1462 | ||
def test_sequence_mixed_types_with_specified_type_fails():
    """A dict value cannot be coerced to an explicit string type.

    The local holding the type is named ``ty`` so it does not shadow the
    ``type`` builtin.
    """
    data = ['-10', '-5', {'a': 1}, '0', '5', '10']

    ty = pa.string()
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
1469 | ||
1470 | ||
def test_sequence_decimal():
    """Decimal values round-trip through decimal128 and decimal256."""
    data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=7, scale=3))
        assert arr.to_pylist() == data
1476 | ||
1477 | ||
def test_sequence_decimal_different_precisions():
    """Values of differing precision fit a common explicit decimal type."""
    data = [
        decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
    ]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=13, scale=3))
        assert arr.to_pylist() == data
1485 | ||
1486 | ||
def test_sequence_decimal_no_scale():
    """Integral decimals round-trip through a zero-scale decimal type."""
    data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=10))
        assert arr.to_pylist() == data
1492 | ||
1493 | ||
def test_sequence_decimal_negative():
    """Negative decimal values round-trip unchanged."""
    data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=10, scale=6))
        assert arr.to_pylist() == data
1499 | ||
1500 | ||
def test_sequence_decimal_no_whole_part():
    """Purely fractional decimals (precision == scale) round-trip."""
    data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=7, scale=7))
        assert arr.to_pylist() == data
1506 | ||
1507 | ||
def test_sequence_decimal_large_integer():
    """Decimals with large integral parts round-trip unchanged."""
    data = [decimal.Decimal('-394029506937548693.42983'),
            decimal.Decimal('32358695912932.01033')]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=23, scale=5))
        assert arr.to_pylist() == data
1514 | ||
1515 | ||
def test_sequence_decimal_from_integers():
    """Plain ints convert to decimals under an explicit decimal type."""
    data = [0, 1, -39402950693754869342983]
    expected = [decimal.Decimal(x) for x in data]
    # 'decimal_type' rather than 'type' to avoid shadowing the builtin
    for decimal_type in [pa.decimal128, pa.decimal256]:
        arr = pa.array(data, type=decimal_type(precision=28, scale=5))
        assert arr.to_pylist() == expected
1522 | ||
1523 | ||
def test_sequence_decimal_too_high_precision():
    # ARROW-6989: an 80-digit Python decimal exceeds the supported precision
    with pytest.raises(ValueError, match="precision out of range"):
        pa.array([decimal.Decimal('1' * 80)])
1528 | ||
1529 | ||
def test_sequence_decimal_infer():
    """Precision and scale are inferred from a single decimal value."""
    cases = [
        # simple case
        (decimal.Decimal('1.234'), pa.decimal128(4, 3)),
        # trailing zeros
        (decimal.Decimal('12300'), pa.decimal128(5, 0)),
        (decimal.Decimal('12300.0'), pa.decimal128(6, 1)),
        # scientific power notation
        (decimal.Decimal('1.23E+4'), pa.decimal128(5, 0)),
        (decimal.Decimal('123E+2'), pa.decimal128(5, 0)),
        (decimal.Decimal('123E+4'), pa.decimal128(7, 0)),
        # leading zeros
        (decimal.Decimal('0.0123'), pa.decimal128(4, 4)),
        (decimal.Decimal('0.01230'), pa.decimal128(5, 5)),
        (decimal.Decimal('1.230E-2'), pa.decimal128(5, 5)),
    ]
    for value, expected_type in cases:
        assert pa.infer_type([value]) == expected_type
        result = pa.array([value])
        assert result.type == expected_type
        assert result.to_pylist()[0] == value
1550 | ||
1551 | ||
def test_sequence_decimal_infer_mixed():
    # ARROW-12150 - mixed precision/scale inputs must infer a common type
    # wide enough to hold every value
    cases = [
        ([decimal.Decimal('1.234'), decimal.Decimal('3.456')],
         pa.decimal128(4, 3)),
        ([decimal.Decimal('1.234'), decimal.Decimal('456.7')],
         pa.decimal128(6, 3)),
        ([decimal.Decimal('123.4'), decimal.Decimal('4.567')],
         pa.decimal128(6, 3)),
        ([decimal.Decimal('123e2'), decimal.Decimal('4567e3')],
         pa.decimal128(7, 0)),
        ([decimal.Decimal('123e4'), decimal.Decimal('4567e2')],
         pa.decimal128(7, 0)),
        ([decimal.Decimal('0.123'), decimal.Decimal('0.04567')],
         pa.decimal128(5, 5)),
        ([decimal.Decimal('0.001'), decimal.Decimal('1.01E5')],
         pa.decimal128(9, 3)),
    ]
    for values, expected_type in cases:
        assert pa.infer_type(values) == expected_type
        result = pa.array(values)
        assert result.type == expected_type
        assert result.to_pylist() == values
1576 | ||
1577 | ||
def test_sequence_decimal_given_type():
    """Explicit decimal types: wide-enough ones accept, narrow ones raise."""
    cases = [
        # simple case
        (
            decimal.Decimal('1.234'),
            [pa.decimal128(4, 3), pa.decimal128(5, 3), pa.decimal128(5, 4)],
            [pa.decimal128(4, 2), pa.decimal128(4, 4)]
        ),
        # trailing zeros
        (
            decimal.Decimal('12300'),
            [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
            [pa.decimal128(4, 0), pa.decimal128(3, -3)]
        ),
        # scientific power notation
        (
            decimal.Decimal('1.23E+4'),
            [pa.decimal128(5, 0), pa.decimal128(6, 0), pa.decimal128(3, -2)],
            [pa.decimal128(4, 0), pa.decimal128(3, -3)]
        ),
    ]
    for value, good_types, bad_types in cases:
        for ty in good_types:
            result = pa.array([value], type=ty)
            assert result.type == ty
            assert result.to_pylist()[0] == value
        for ty in bad_types:
            with pytest.raises(ValueError):
                pa.array([value], type=ty)
1606 | ||
1607 | ||
def test_range_types():
    """A range object and an equivalent tuple produce equal arrays."""
    assert pa.array(range(3)).equals(pa.array((0, 1, 2)))
1612 | ||
1613 | ||
def test_empty_range():
    """An empty range infers the null type."""
    result = pa.array(range(0))
    assert len(result) == 0
    assert result.null_count == 0
    assert result.type == pa.null()
    assert result.to_pylist() == []
1620 | ||
1621 | ||
def test_structarray():
    """StructArray.from_arrays: empty, populated, and mismatched inputs."""
    # zero-field, zero-length struct array
    empty = pa.StructArray.from_arrays([], names=[])
    assert empty.type == pa.struct([])
    assert len(empty) == 0
    assert empty.to_pylist() == []

    ints = pa.array([None, 2, 3], type=pa.int64())
    strs = pa.array(['a', None, 'c'], type=pa.string())
    bools = pa.array([True, False, None], type=pa.bool_())
    result = pa.StructArray.from_arrays([ints, strs, bools],
                                        ['ints', 'strs', 'bools'])

    expected = [
        {'ints': None, 'strs': 'a', 'bools': True},
        {'ints': 2, 'strs': None, 'bools': False},
        {'ints': 3, 'strs': 'c', 'bools': None},
    ]
    assert result.to_pylist() == expected

    # len(names) != len(arrays)
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays([ints], ['ints', 'strs'])
1647 | ||
1648 | ||
def test_struct_from_dicts():
    """Dicts convert to structs; missing keys come back as None fields."""
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    assert pa.array([], type=ty).to_pylist() == []

    # fully populated dicts round-trip unchanged
    records = [{'a': 5, 'b': 'foo', 'c': True},
               {'a': 6, 'b': 'bar', 'c': False}]
    assert pa.array(records, type=ty).to_pylist() == records

    # omitted keys become null field values
    records = [{'a': 5, 'c': True},
               None,
               {},
               {'a': None, 'b': 'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': 'bar', 'c': None}]
    assert pa.array(records, type=ty).to_pylist() == expected
1672 | ||
1673 | ||
def test_struct_from_dicts_bytes_keys():
    # ARROW-6878
    """Dict keys given as bytes are matched to string field names."""
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    assert pa.array([], type=ty).to_pylist() == []

    records = [{b'a': 5, b'b': 'foo'},
               {b'a': 6, b'c': False}]
    result = pa.array(records, type=ty)
    assert result.to_pylist() == [
        {'a': 5, 'b': 'foo', 'c': None},
        {'a': 6, 'b': None, 'c': False},
    ]
1689 | ||
1690 | ||
def test_struct_from_tuples():
    """Tuples map positionally onto struct fields; wrong arity raises."""
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    rows = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    result = pa.array(rows, type=ty)
    assert result.to_pylist() == expected

    # an object ndarray of tuples converts identically
    boxed = np.empty(len(rows), dtype=object)
    boxed[:] = rows
    assert result.equals(pa.array(boxed, type=ty))

    # omitted values and null rows
    rows = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    assert pa.array(rows, type=ty).to_pylist() == expected

    # tuples whose arity differs from the field count are rejected
    for bad in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([bad], type=ty)
1723 | ||
1724 | ||
def test_struct_from_list_of_pairs():
    """Structs can be built from lists of (key, value) pairs."""
    ty = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])
    data = [
        [('a', 5), ('b', 'foo'), ('c', True)],
        [('a', 6), ('b', 'bar'), ('c', False)],
        None
    ]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == [
        {'a': 5, 'b': 'foo', 'c': True},
        {'a': 6, 'b': 'bar', 'c': False},
        None
    ]

    # test with duplicated field names
    ty = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('a', pa.string()),
        pa.field('b', pa.bool_())
    ])
    data = [
        [('a', 5), ('a', 'foo'), ('b', True)],
        [('a', 6), ('a', 'bar'), ('b', False)],
    ]
    arr = pa.array(data, type=ty)
    # conversion succeeds, but to_pylist cannot represent duplicate keys
    with pytest.raises(ValueError):
        # TODO(kszucs): ARROW-9997
        arr.to_pylist()

    # test with empty elements
    ty = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])
    data = [
        [],
        [('a', 5), ('b', 'foo'), ('c', True)],
        [('a', 2), ('b', 'baz')],
        [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')],
    ]
    # missing pairs become nulls; the extra 'd' pair is absent from the
    # result (see last expected row)
    expected = [
        {'a': None, 'b': None, 'c': None},
        {'a': 5, 'b': 'foo', 'c': True},
        {'a': 2, 'b': 'baz', 'c': None},
        {'a': 1, 'b': 'bar', 'c': False},
    ]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected
1778 | ||
1779 | ||
def test_struct_from_list_of_pairs_errors():
    """Error paths when building structs from lists of (key, value) pairs."""
    ty = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])

    # test that it raises if the key doesn't match the expected field name
    data = [
        [],
        [('a', 5), ('c', True), ('b', None)],
    ]
    msg = "The expected field name is `b` but `c` was given"
    with pytest.raises(ValueError, match=msg):
        pa.array(data, type=ty)

    # test various errors both at the first position and after because of key
    # type inference
    template = (
        r"Could not convert {} with type {}: was expecting tuple of "
        r"(key, value) pair"
    )
    cases = [
        tuple(),  # empty key-value pair
        tuple('a',),  # missing value
        tuple('unknown-key',),  # not known field name
        'string',  # not a tuple
    ]
    # NOTE(review): tuple('unknown-key',) iterates the string and yields an
    # 11-character tuple, not ('unknown-key',); this case therefore fails as
    # a wrong-size tuple rather than as an unknown field name — confirm that
    # matches the original intent before changing it.
    for key_value_pair in cases:
        # expected message embeds the repr and type name of the bad element
        msg = re.escape(template.format(
            repr(key_value_pair), type(key_value_pair).__name__
        ))

        # bad element first, valid row after
        with pytest.raises(TypeError, match=msg):
            pa.array([
                [key_value_pair],
                [('a', 5), ('b', 'foo'), ('c', None)],
            ], type=ty)

        # valid row first, bad element after (key type already inferred)
        with pytest.raises(TypeError, match=msg):
            pa.array([
                [('a', 5), ('b', 'foo'), ('c', None)],
                [key_value_pair],
            ], type=ty)
1824 | ||
1825 | ||
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])
    mixed_rows = [(5, 'foo', True),
                  {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(mixed_rows, type=struct_type)
1835 | ||
1836 | ||
def test_struct_from_dicts_inference():
    """Struct type inference from plain Python dicts.

    Covers full rows, omitted/null values, object-dtype ndarray input,
    nested structs, the empty-struct edge case, and rejection of mixed
    scalar/struct input.
    """
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]

    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': 'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': 'bar', 'c': None}]

    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    # BUGFIX: previously passed `data` again, so the object-ndarray code
    # path was never exercised (cf. the tuple-based test above).
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)

    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
1887 | ||
1888 | ||
def test_structarray_from_arrays_coerce():
    # ARROW-1706: plain Python sequences are coerced to arrays
    field_names = ['ints', 'strs', 'bools', 'int_nonnull']
    raw_columns = [[None, 2, 3],
                   ['a', None, 'c'],
                   [True, False, None],
                   [1, 2, 3]]

    result = pa.StructArray.from_arrays(raw_columns, field_names)

    coerced_columns = [pa.array(raw_columns[0], type='int64'),
                       pa.array(raw_columns[1], type='utf8'),
                       pa.array(raw_columns[2]),
                       pa.array(raw_columns[3], type='int64')]
    expected = pa.StructArray.from_arrays(coerced_columns, field_names)

    # field names are mandatory when passing plain sequences
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays(raw_columns)

    assert result.equals(expected)
1911 | ||
1912 | ||
def test_decimal_array_with_none_and_nan():
    """NaN handling in decimal conversion: rejected unless from_pandas=True."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # ARROW-6227: Without from_pandas=True, NaN is considered a float,
    # which cannot mix with Decimal values.
    # (Dropped the dead `array =` assignment: inside pytest.raises the
    # name was never bound to a usable value.)
    with pytest.raises(TypeError):
        pa.array(values)

    array = pa.array(values, from_pandas=True)
    assert array.type == pa.decimal128(4, 3)
    # both float NaN and Decimal('nan') convert to null
    assert array.to_pylist() == values[:2] + [None, None]

    array = pa.array(values, type=pa.decimal128(10, 4), from_pandas=True)
    assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
1926 | ||
1927 | ||
def test_map_from_dicts():
    """Map arrays built from (key, value) tuples derived from dict entries."""
    source = [[{'key': b'a', 'value': 1}, {'key': b'b', 'value': 2}],
              [{'key': b'c', 'value': 3}],
              [{'key': b'd', 'value': 4}, {'key': b'e', 'value': 5},
               {'key': b'f', 'value': None}],
              [{'key': b'g', 'value': 7}]]
    expected = [[(d['key'], d['value']) for d in entry] for entry in source]

    map_type = pa.map_(pa.binary(), pa.int32())
    arr = pa.array(expected, type=map_type)
    assert arr.to_pylist() == expected

    # With omitted values
    source[1] = None
    expected[1] = None

    arr = pa.array(expected, type=map_type)
    assert arr.to_pylist() == expected

    # Invalid dictionary
    for bad_entry in ([{'value': 5}], [{}], [{'k': 1, 'v': 2}]):
        with pytest.raises(ValueError, match="Invalid Map"):
            pa.array([bad_entry], type=pa.map_('i4', 'i4'))

    # Invalid dictionary types
    for bad_entry in ([{'key': '1', 'value': 5}], [{'key': {'value': 2}}]):
        with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
            pa.array([bad_entry], type=pa.map_('i4', 'i4'))
1957 | ||
1958 | ||
def test_map_from_tuples():
    """Map arrays accept per-row lists of (key, value) tuples directly."""
    pairs = [[(b'a', 1), (b'b', 2)],
             [(b'c', 3)],
             [(b'd', 4), (b'e', 5), (b'f', None)],
             [(b'g', 7)]]

    map_type = pa.map_(pa.binary(), pa.int32())
    arr = pa.array(pairs, type=map_type)
    assert arr.to_pylist() == pairs

    # With omitted values
    pairs[1] = None

    arr = pa.array(pairs, type=map_type)
    assert arr.to_pylist() == pairs

    # Invalid tuple size
    for bad_entry in ([(5,)], [()], [('5', 'foo', True)]):
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([bad_entry], type=pa.map_('i4', 'i4'))
1980 | ||
1981 | ||
def test_dictionary_from_boolean():
    """Boolean values can be dictionary-encoded at construction time."""
    dict_type = pa.dictionary(pa.int8(), value_type=pa.bool_())
    encoded = pa.array([False, False, True, False, True], type=dict_type)

    assert isinstance(encoded.type, pa.DictionaryType)
    assert encoded.type.equals(dict_type)
    assert encoded.indices.equals(pa.array([0, 0, 1, 0, 1], type=pa.int8()))
    assert encoded.dictionary.equals(pa.array([False, True], type=pa.bool_()))
1992 | ||
1993 | ||
@pytest.mark.parametrize('value_type', [
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64(),
    pa.float32(),
    pa.float64(),
])
def test_dictionary_from_integers(value_type):
    """Numeric values dictionary-encode with deduplicated dictionaries."""
    dict_type = pa.dictionary(pa.int8(), value_type=value_type)
    encoded = pa.array([1, 2, 1, 1, 2, 3], type=dict_type)

    assert isinstance(encoded.type, pa.DictionaryType)
    assert encoded.type.equals(dict_type)
    assert encoded.indices.equals(
        pa.array([0, 1, 0, 0, 1, 2], type=pa.int8()))
    assert encoded.dictionary.equals(pa.array([1, 2, 3], type=value_type))
2016 | ||
2017 | ||
@pytest.mark.parametrize('input_index_type', [
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
def test_dictionary_index_type(input_index_type):
    # dictionary array is constructed using adaptive index type builder,
    # but the input index type is considered as the minimal width type to use
    dict_type = pa.dictionary(input_index_type, value_type=pa.int64())
    result = pa.array(range(10), type=dict_type)
    assert result.type.equals(dict_type)
2031 | ||
2032 | ||
def test_dictionary_is_always_adaptive():
    """Index width grows past the requested type when the data demands it.

    The adaptive index builder treats the given index type only as the
    minimal width, so the output index type may be wider.
    """
    requested = pa.dictionary(pa.int8(), value_type=pa.int64())

    # 128 distinct values still fit into int8 indices
    within_range = pa.array(range(2**7), type=requested)
    assert within_range.type.equals(pa.dictionary(pa.int8(), pa.int64()))

    # one more distinct value overflows int8, so int16 indices are used
    overflowed = pa.array(range(2**7 + 1), type=requested)
    assert overflowed.type.equals(pa.dictionary(pa.int16(), pa.int64()))
2046 | ||
2047 | ||
def test_dictionary_from_strings():
    """String-like values dictionary-encode with deduplicated dictionaries."""
    for value_type in (pa.binary(), pa.string()):
        dict_type = pa.dictionary(pa.int8(), value_type)
        encoded = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=dict_type)

        assert isinstance(encoded.type, pa.DictionaryType)
        assert encoded.indices.equals(
            pa.array([0, 1, 2, 1, 2, 3], type=pa.int8()))
        assert encoded.dictionary.equals(
            pa.array(["", "a", "bb", "ccc"], type=value_type))

    # fixed size binary type
    dict_type = pa.dictionary(pa.int8(), pa.binary(3))
    encoded = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=dict_type)

    assert isinstance(encoded.type, pa.DictionaryType)
    assert encoded.indices.equals(pa.array([0, 0, 1, 2, 1], type=pa.int8()))
    assert encoded.dictionary.equals(
        pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3)))
2069 | ||
2070 | ||
@pytest.mark.parametrize(('unit', 'expected'), [
    ('s', datetime.timedelta(seconds=-2147483000)),
    ('ms', datetime.timedelta(milliseconds=-2147483000)),
    ('us', datetime.timedelta(microseconds=-2147483000)),
    ('ns', datetime.timedelta(microseconds=-2147483))
])
def test_duration_array_roundtrip_corner_cases(unit, expected):
    # Corner case discovered by hypothesis: there were implicit conversions to
    # unsigned values resulting wrong values with wrong signs.
    duration_type = pa.duration(unit)
    original = pa.array([-2147483000], type=duration_type)
    restored = pa.array(original.to_pylist(), type=duration_type)
    assert original.equals(restored)

    expected_list = [expected]
    if unit == 'ns':
        # if pandas is available then a pandas Timedelta is returned
        try:
            import pandas as pd
        except ImportError:
            pass
        else:
            expected_list = [pd.Timedelta(-2147483000, unit='ns')]

    assert restored.to_pylist() == expected_list
2096 | ||
2097 | ||
@pytest.mark.pandas
def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
    # corner case discovered by hypothesis: preserving the nanoseconds on
    # conversion from a list of Timedelta and Timestamp objects
    import pandas as pd

    duration_type = pa.duration('ns')
    original = pa.array([9223371273709551616], type=duration_type)
    values = original.to_pylist()
    assert isinstance(values[0], pd.Timedelta)
    rebuilt = pa.array(values, type=duration_type)
    assert original.equals(rebuilt)
    assert rebuilt.to_pylist() == [
        pd.Timedelta(9223371273709551616, unit='ns')
    ]

    timestamp_type = pa.timestamp('ns')
    original = pa.array([9223371273709551616], type=timestamp_type)
    values = original.to_pylist()
    assert isinstance(values[0], pd.Timestamp)
    rebuilt = pa.array(values, type=timestamp_type)
    assert original.equals(rebuilt)
    assert rebuilt.to_pylist() == [
        pd.Timestamp(9223371273709551616, unit='ns')
    ]

    # timezone-aware timestamps roundtrip through localized pd.Timestamp
    tz_type = pa.timestamp('ns', tz='US/Eastern')
    value = 1604119893000000000
    original = pa.array([value], type=tz_type)
    values = original.to_pylist()
    assert isinstance(values[0], pd.Timestamp)
    rebuilt = pa.array(values, type=tz_type)
    assert original.equals(rebuilt)
    assert rebuilt.to_pylist() == [
        pd.Timestamp(value, unit='ns').tz_localize(
            "UTC").tz_convert('US/Eastern')
    ]
2135 | ||
2136 | ||
@h.given(past.all_arrays)
def test_array_to_pylist_roundtrip(arr):
    """Property test: to_pylist followed by pa.array reproduces the array."""
    as_python = arr.to_pylist()
    rebuilt = pa.array(as_python, type=arr.type)
    assert rebuilt.equals(arr)
2142 | ||
2143 | ||
@pytest.mark.large_memory
def test_auto_chunking_binary_like():
    """Binary arrays whose data exceeds the builder capacity get chunked."""
    big = b'x' * 100000000
    almost_filling = b'x' * 147483646

    # exactly fits into a single chunk
    one_chunk_data = [big] * 20 + [b'', None, almost_filling]
    arr = pa.array(one_chunk_data, type=pa.binary())
    assert isinstance(arr, pa.Array)
    assert len(arr) == 23
    assert arr[20].as_py() == b''
    assert arr[21].as_py() is None
    assert arr[22].as_py() == almost_filling

    # one extra value spills into a second chunk
    arr = pa.array(one_chunk_data + [b'two'], type=pa.binary())
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 23
    assert len(arr.chunk(1)) == 1
    assert arr.chunk(0)[20].as_py() == b''
    assert arr.chunk(0)[21].as_py() is None
    assert arr.chunk(0)[22].as_py() == almost_filling
    assert arr.chunk(1).to_pylist() == [b'two']

    # doubling the data yields three chunks
    arr = pa.array(one_chunk_data * 2 + [b'three', b'three'],
                   type=pa.binary())
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 3
    assert len(arr.chunk(0)) == 23
    assert len(arr.chunk(1)) == 23
    assert len(arr.chunk(2)) == 2
    for i in range(2):
        assert arr.chunk(i)[20].as_py() == b''
        assert arr.chunk(i)[21].as_py() is None
        assert arr.chunk(i)[22].as_py() == almost_filling
    assert arr.chunk(2).to_pylist() == [b'three', b'three']
2184 | ||
2185 | ||
@pytest.mark.large_memory
def test_auto_chunking_list_of_binary():
    # ARROW-6281
    payload = ['x' * 1024]
    arr = pa.array([payload] * ((2 << 20) + 1))
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 2**21 - 1
    assert len(arr.chunk(1)) == 2
    assert arr.chunk(1).to_pylist() == [payload] * 2
2196 | ||
2197 | ||
@pytest.mark.large_memory
def test_auto_chunking_list_like():
    """List-of-uint8 arrays exceeding child capacity are split into chunks."""
    big_item = np.ones((2**28,), dtype='uint8')

    # 7 items still fit into one chunk
    arr = pa.array([big_item] * (2**3 - 1), type=pa.list_(pa.uint8()))
    assert isinstance(arr, pa.Array)
    assert len(arr) == 7

    # the 8th item overflows and starts a second chunk
    big_item = np.ones((2**28,), dtype='uint8')
    arr = pa.array([big_item] * 2**3, type=pa.list_(pa.uint8()))
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 7
    assert len(arr.chunk(1)) == 1
    spilled = arr.chunk(1)[0]
    assert isinstance(spilled, pa.ListScalar)
    assert spilled.values == pa.array(big_item, type=pa.uint8())
2218 | ||
2219 | ||
@pytest.mark.slow
@pytest.mark.large_memory
def test_auto_chunking_map_type():
    # takes ~20 minutes locally
    map_type = pa.map_(pa.int8(), pa.int8())
    pairs = [(1, 1)] * 2**28
    arr = pa.array([pairs] * 2**3, type=map_type)
    assert isinstance(arr, pa.ChunkedArray)
    assert len(arr.chunk(0)) == 7
    assert len(arr.chunk(1)) == 1
2231 | ||
2232 | ||
@pytest.mark.large_memory
@pytest.mark.parametrize(('ty', 'char'), [
    (pa.string(), 'x'),
    (pa.binary(), b'x'),
])
def test_nested_auto_chunking(ty, char):
    """Auto-chunking also triggers for string-like children inside structs."""
    big = char * 100000000
    almost_filling = char * 147483646

    struct_type = pa.struct([
        pa.field('bool', pa.bool_()),
        pa.field('integer', pa.int64()),
        pa.field('string-like', ty),
    ])

    rows = [{'bool': True, 'integer': 1, 'string-like': big}] * 20
    rows.append({'bool': True, 'integer': 1, 'string-like': almost_filling})
    arr = pa.array(rows, type=struct_type)
    assert isinstance(arr, pa.Array)

    # one more value exceeds the child capacity and forces a second chunk
    rows.append({'bool': True, 'integer': 1, 'string-like': char})
    arr = pa.array(rows, type=struct_type)
    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks == 2
    assert len(arr.chunk(0)) == 21
    assert len(arr.chunk(1)) == 1
    assert arr.chunk(1)[0].as_py() == {
        'bool': True,
        'integer': 1,
        'string-like': char
    }
2264 | ||
2265 | ||
@pytest.mark.large_memory
def test_array_from_pylist_data_overflow():
    # Regression test for ARROW-12983
    # Data buffer overflow - should result in chunked array
    count = 2**19
    items = [b'a' * 4096] * count

    def assert_chunked(result):
        # shared shape checks for every conversion variant below
        assert isinstance(result, pa.ChunkedArray)
        assert len(result) == count
        assert len(result.chunks) > 1

    assert_chunked(pa.array(items, type=pa.string()))
    assert_chunked(pa.array(items, mask=np.zeros(count, bool),
                            type=pa.string()))
    assert_chunked(pa.array(items, type=pa.binary()))
2286 | ||
2287 | ||
@pytest.mark.slow
@pytest.mark.large_memory
def test_array_from_pylist_offset_overflow():
    # Regression test for ARROW-12983
    # Offset buffer overflow - should result in chunked array
    # Note this doesn't apply to primitive arrays
    count = 2**31
    items = [b'a'] * count

    def assert_chunked(result):
        # shared shape checks for every conversion variant below
        assert isinstance(result, pa.ChunkedArray)
        assert len(result) == count
        assert len(result.chunks) > 1

    assert_chunked(pa.array(items, type=pa.string()))
    assert_chunked(pa.array(items, mask=np.zeros(count, bool),
                            type=pa.string()))
    assert_chunked(pa.array(items, type=pa.binary()))