1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 from datetime import datetime
19 from functools import lru_cache, partial
20 import inspect
21 import pickle
22 import pytest
23 import random
24 import sys
25 import textwrap
26
27 import numpy as np
28
29 try:
30 import pandas as pd
31 except ImportError:
32 pd = None
33
34 import pyarrow as pa
35 import pyarrow.compute as pc
36
37 all_array_types = [
38 ('bool', [True, False, False, True, True]),
39 ('uint8', np.arange(5)),
40 ('int8', np.arange(5)),
41 ('uint16', np.arange(5)),
42 ('int16', np.arange(5)),
43 ('uint32', np.arange(5)),
44 ('int32', np.arange(5)),
45 ('uint64', np.arange(5, 10)),
46 ('int64', np.arange(5, 10)),
47 ('float', np.arange(0, 0.5, 0.1)),
48 ('double', np.arange(0, 0.5, 0.1)),
49 ('string', ['a', 'b', None, 'ddd', 'ee']),
50 ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
51 (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
52 (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
53 (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
54 (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
55 {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
56 ]
57
58 exported_functions = [
59 func for (name, func) in sorted(pc.__dict__.items())
60 if hasattr(func, '__arrow_compute_function__')]
61
62 exported_option_classes = [
63 cls for (name, cls) in sorted(pc.__dict__.items())
64 if (isinstance(cls, type) and
65 cls is not pc.FunctionOptions and
66 issubclass(cls, pc.FunctionOptions))]
67
68 numerical_arrow_types = [
69 pa.int8(),
70 pa.int16(),
71 pa.int64(),
72 pa.uint8(),
73 pa.uint16(),
74 pa.uint64(),
75 pa.float32(),
76 pa.float64()
77 ]
78
79
80 def test_exported_functions():
81 # Check that all exported concrete functions can be called with
82 # the right number of arguments.
83 # Note that unregistered functions (e.g. with a mismatching name)
84 # will raise KeyError.
85 functions = exported_functions
86 assert len(functions) >= 10
87 for func in functions:
88 arity = func.__arrow_compute_function__['arity']
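# An arity of Ellipsis denotes a varargs function; any argument count is
# accepted, so three dummy arguments are used here.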
89 if arity is Ellipsis:
90 args = [object()] * 3
91 else:
92 args = [object()] * arity
93 with pytest.raises(TypeError,
94 match="Got unexpected argument type "
95 "<class 'object'> for compute function"):
96 func(*args)
97
98
99 def test_exported_option_classes():
100 classes = exported_option_classes
101 assert len(classes) >= 10
102 for cls in classes:
103 # Option classes must have an introspectable constructor signature,
104 # and that signature should not have any *args or **kwargs.
105 sig = inspect.signature(cls)
106 for param in sig.parameters.values():
107 assert param.kind not in (param.VAR_POSITIONAL,
108 param.VAR_KEYWORD)
109
110
111 def test_option_class_equality():
112 options = [
113 pc.ArraySortOptions(),
114 pc.AssumeTimezoneOptions("UTC"),
115 pc.CastOptions.safe(pa.int8()),
116 pc.CountOptions(),
117 pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
118 pc.DictionaryEncodeOptions(),
119 pc.ElementWiseAggregateOptions(skip_nulls=True),
120 pc.ExtractRegexOptions("pattern"),
121 pc.FilterOptions(),
122 pc.IndexOptions(pa.scalar(1)),
123 pc.JoinOptions(),
124 pc.MakeStructOptions(["field", "names"],
125 field_nullability=[True, True],
126 field_metadata=[pa.KeyValueMetadata({"a": "1"}),
127 pa.KeyValueMetadata({"b": "2"})]),
128 pc.MatchSubstringOptions("pattern"),
129 pc.ModeOptions(),
130 pc.NullOptions(),
131 pc.PadOptions(5),
132 pc.PartitionNthOptions(1, null_placement="at_start"),
133 pc.QuantileOptions(),
134 pc.ReplaceSliceOptions(0, 1, "a"),
135 pc.ReplaceSubstringOptions("a", "b"),
136 pc.RoundOptions(2, "towards_infinity"),
137 pc.RoundToMultipleOptions(100, "towards_infinity"),
138 pc.ScalarAggregateOptions(),
139 pc.SelectKOptions(0, sort_keys=[("b", "ascending")]),
140 pc.SetLookupOptions(pa.array([1])),
141 pc.SliceOptions(0, 1, 1),
142 pc.SortOptions([("dummy", "descending")], null_placement="at_start"),
143 pc.SplitOptions(),
144 pc.SplitPatternOptions("pattern"),
145 pc.StrftimeOptions(),
146 pc.StrptimeOptions("%Y", "s"),
147 pc.TakeOptions(),
148 pc.TDigestOptions(),
149 pc.TrimOptions(" "),
150 pc.VarianceOptions(),
151 pc.WeekOptions(week_starts_monday=True, count_from_zero=False,
152 first_week_is_fully_in_year=False),
153 ]
154         # TODO: We should test on Windows once ARROW-13168 is resolved.
155 # Timezone database is not available on Windows yet
156 if sys.platform != 'win32':
157 options.append(pc.AssumeTimezoneOptions("Europe/Ljubljana"))
158
159 classes = {type(option) for option in options}
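# Any exported option class not explicitly listed above must be
# default-constructible so that it is still exercised by the checks below.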
160 for cls in exported_option_classes:
161 # Timezone database is not available on Windows yet
162 if cls not in classes and sys.platform != 'win32' and \
163 cls != pc.AssumeTimezoneOptions:
164 try:
165 options.append(cls())
166 except TypeError:
167 pytest.fail(f"Options class is not tested: {cls}")
168 for option in options:
169 assert option == option
170 assert repr(option).startswith(option.__class__.__name__)
171 buf = option.serialize()
172 deserialized = pc.FunctionOptions.deserialize(buf)
173 assert option == deserialized
174 assert repr(option) == repr(deserialized)
175 for option1, option2 in zip(options, options[1:]):
176 assert option1 != option2
177
178 assert repr(pc.IndexOptions(pa.scalar(1))) == "IndexOptions(value=int64:1)"
179 assert repr(pc.ArraySortOptions()) == \
180 "ArraySortOptions(order=Ascending, null_placement=AtEnd)"
181
182
183 def test_list_functions():
184 assert len(pc.list_functions()) > 10
185 assert "add" in pc.list_functions()
186
187
188 def _check_get_function(name, expected_func_cls, expected_ker_cls,
189 min_num_kernels=1):
190 func = pc.get_function(name)
191 assert isinstance(func, expected_func_cls)
192 n = func.num_kernels
193 assert n >= min_num_kernels
194 assert n == len(func.kernels)
195 assert all(isinstance(ker, expected_ker_cls) for ker in func.kernels)
196
197
198 def test_get_function_scalar():
199 _check_get_function("add", pc.ScalarFunction, pc.ScalarKernel, 8)
200
201
202 def test_get_function_vector():
203 _check_get_function("unique", pc.VectorFunction, pc.VectorKernel, 8)
204
205
206 def test_get_function_scalar_aggregate():
207 _check_get_function("mean", pc.ScalarAggregateFunction,
208 pc.ScalarAggregateKernel, 8)
209
210
211 def test_get_function_hash_aggregate():
212 _check_get_function("hash_sum", pc.HashAggregateFunction,
213 pc.HashAggregateKernel, 1)
214
215
216 def test_call_function_with_memory_pool():
217 arr = pa.array(["foo", "bar", "baz"])
218 indices = np.array([2, 2, 1])
219 result1 = arr.take(indices)
220 result2 = pc.call_function('take', [arr, indices],
221 memory_pool=pa.default_memory_pool())
222 expected = pa.array(["baz", "baz", "bar"])
223 assert result1.equals(expected)
224 assert result2.equals(expected)
225
226 result3 = pc.take(arr, indices, memory_pool=pa.default_memory_pool())
227 assert result3.equals(expected)
228
229
230 def test_pickle_functions():
231 # Pickle registered functions
232 for name in pc.list_functions():
233 func = pc.get_function(name)
234 reconstructed = pickle.loads(pickle.dumps(func))
235 assert type(reconstructed) is type(func)
236 assert reconstructed.name == func.name
237 assert reconstructed.arity == func.arity
238 assert reconstructed.num_kernels == func.num_kernels
239
240
241 def test_pickle_global_functions():
242 # Pickle global wrappers (manual or automatic) of registered functions
243 for name in pc.list_functions():
244 func = getattr(pc, name)
245 reconstructed = pickle.loads(pickle.dumps(func))
246 assert reconstructed is func
247
248
249 def test_function_attributes():
250 # Sanity check attributes of registered functions
251 for name in pc.list_functions():
252 func = pc.get_function(name)
253 assert isinstance(func, pc.Function)
254 assert func.name == name
255 kernels = func.kernels
256 assert func.num_kernels == len(kernels)
257 assert all(isinstance(ker, pc.Kernel) for ker in kernels)
258 if func.arity is not Ellipsis:
259 assert func.arity >= 1
260 repr(func)
261 for ker in kernels:
262 repr(ker)
263
264
265 def test_input_type_conversion():
266 # Automatic array conversion from Python
267 arr = pc.add([1, 2], [4, None])
268 assert arr.to_pylist() == [5, None]
269 # Automatic scalar conversion from Python
270 arr = pc.add([1, 2], 4)
271 assert arr.to_pylist() == [5, 6]
272 # Other scalar type
273 assert pc.equal(["foo", "bar", None],
274 "foo").to_pylist() == [True, False, None]
275
276
277 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
278 def test_sum_array(arrow_type):
279 arr = pa.array([1, 2, 3, 4], type=arrow_type)
280 assert arr.sum().as_py() == 10
281 assert pc.sum(arr).as_py() == 10
282
283 arr = pa.array([1, 2, 3, 4, None], type=arrow_type)
284 assert arr.sum().as_py() == 10
285 assert pc.sum(arr).as_py() == 10
286
287 arr = pa.array([None], type=arrow_type)
288 assert arr.sum().as_py() is None # noqa: E711
289 assert pc.sum(arr).as_py() is None # noqa: E711
290 assert arr.sum(min_count=0).as_py() == 0
291 assert pc.sum(arr, min_count=0).as_py() == 0
292
293 arr = pa.array([], type=arrow_type)
294 assert arr.sum().as_py() is None # noqa: E711
295 assert arr.sum(min_count=0).as_py() == 0
296 assert pc.sum(arr, min_count=0).as_py() == 0
297
298
299 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
300 def test_sum_chunked_array(arrow_type):
301 arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)])
302 assert pc.sum(arr).as_py() == 10
303
304 arr = pa.chunked_array([
305 pa.array([1, 2], type=arrow_type), pa.array([3, 4], type=arrow_type)
306 ])
307 assert pc.sum(arr).as_py() == 10
308
309 arr = pa.chunked_array([
310 pa.array([1, 2], type=arrow_type),
311 pa.array([], type=arrow_type),
312 pa.array([3, 4], type=arrow_type)
313 ])
314 assert pc.sum(arr).as_py() == 10
315
316 arr = pa.chunked_array((), type=arrow_type)
317 assert arr.num_chunks == 0
318 assert pc.sum(arr).as_py() is None # noqa: E711
319 assert pc.sum(arr, min_count=0).as_py() == 0
320
321
322 def test_mode_array():
323 # ARROW-9917
324 arr = pa.array([1, 1, 3, 4, 3, 5], type='int64')
325 mode = pc.mode(arr)
326 assert len(mode) == 1
327 assert mode[0].as_py() == {"mode": 1, "count": 2}
328
329 mode = pc.mode(arr, n=2)
330 assert len(mode) == 2
331 assert mode[0].as_py() == {"mode": 1, "count": 2}
332 assert mode[1].as_py() == {"mode": 3, "count": 2}
333
334 arr = pa.array([], type='int64')
335 assert len(pc.mode(arr)) == 0
336
337 arr = pa.array([1, 1, 3, 4, 3, None], type='int64')
338 mode = pc.mode(arr, skip_nulls=False)
339 assert len(mode) == 0
340 mode = pc.mode(arr, min_count=6)
341 assert len(mode) == 0
342 mode = pc.mode(arr, skip_nulls=False, min_count=5)
343 assert len(mode) == 0
344
345
346 def test_mode_chunked_array():
347 # ARROW-9917
348 arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')])
349 mode = pc.mode(arr)
350 assert len(mode) == 1
351 assert mode[0].as_py() == {"mode": 1, "count": 2}
352
353 mode = pc.mode(arr, n=2)
354 assert len(mode) == 2
355 assert mode[0].as_py() == {"mode": 1, "count": 2}
356 assert mode[1].as_py() == {"mode": 3, "count": 2}
357
358 arr = pa.chunked_array((), type='int64')
359 assert arr.num_chunks == 0
360 assert len(pc.mode(arr)) == 0
361
362
363 def test_variance():
364 data = [1, 2, 3, 4, 5, 6, 7, 8]
365 assert pc.variance(data).as_py() == 5.25
366 assert pc.variance(data, ddof=0).as_py() == 5.25
367 assert pc.variance(data, ddof=1).as_py() == 6.0
368
369
370 def test_count_substring():
371 for (ty, offset) in [(pa.string(), pa.int32()),
372 (pa.large_string(), pa.int64())]:
373 arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty)
374
375 result = pc.count_substring(arr, "ab")
376 expected = pa.array([1, 1, 2, 0, 0, None], type=offset)
377 assert expected.equals(result)
378
379 result = pc.count_substring(arr, "ab", ignore_case=True)
380 expected = pa.array([1, 1, 2, 0, 1, None], type=offset)
381 assert expected.equals(result)
382
383
384 def test_count_substring_regex():
385 for (ty, offset) in [(pa.string(), pa.int32()),
386 (pa.large_string(), pa.int64())]:
387 arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty)
388
389 result = pc.count_substring_regex(arr, "a+")
390 expected = pa.array([1, 1, 3, 1, 0, None], type=offset)
391 assert expected.equals(result)
392
393 result = pc.count_substring_regex(arr, "a+", ignore_case=True)
394 expected = pa.array([1, 1, 2, 1, 1, None], type=offset)
395 assert expected.equals(result)
396
397
398 def test_find_substring():
399 for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]:
400 arr = pa.array(["ab", "cab", "ba", None], type=ty)
401 result = pc.find_substring(arr, "ab")
402 assert result.to_pylist() == [0, 1, -1, None]
403
404 result = pc.find_substring_regex(arr, "a?b")
405 assert result.to_pylist() == [0, 1, 0, None]
406
407 arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty)
408 result = pc.find_substring(arr, "aB*", ignore_case=True)
409 assert result.to_pylist() == [0, 1, -1, -1]
410
411 result = pc.find_substring_regex(arr, "a?b", ignore_case=True)
412 assert result.to_pylist() == [0, 1, 0, 0]
413
414
415 def test_match_like():
416 arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
417 result = pc.match_like(arr, r"_a\%%")
418 expected = pa.array([False, True, False, True, None])
419 assert expected.equals(result)
420
421 arr = pa.array(["aB", "bA%", "ba", "ca%d", None])
422 result = pc.match_like(arr, r"_a\%%", ignore_case=True)
423 expected = pa.array([False, True, False, True, None])
424 assert expected.equals(result)
425 result = pc.match_like(arr, r"_a\%%", ignore_case=False)
426 expected = pa.array([False, False, False, True, None])
427 assert expected.equals(result)
428
429
430 def test_match_substring():
431 arr = pa.array(["ab", "abc", "ba", None])
432 result = pc.match_substring(arr, "ab")
433 expected = pa.array([True, True, False, None])
434 assert expected.equals(result)
435
436 arr = pa.array(["áB", "Ábc", "ba", None])
437 result = pc.match_substring(arr, "áb", ignore_case=True)
438 expected = pa.array([True, True, False, None])
439 assert expected.equals(result)
440 result = pc.match_substring(arr, "áb", ignore_case=False)
441 expected = pa.array([False, False, False, None])
442 assert expected.equals(result)
443
444
445 def test_match_substring_regex():
446 arr = pa.array(["ab", "abc", "ba", "c", None])
447 result = pc.match_substring_regex(arr, "^a?b")
448 expected = pa.array([True, True, True, False, None])
449 assert expected.equals(result)
450
451 arr = pa.array(["aB", "Abc", "BA", "c", None])
452 result = pc.match_substring_regex(arr, "^a?b", ignore_case=True)
453 expected = pa.array([True, True, True, False, None])
454 assert expected.equals(result)
455 result = pc.match_substring_regex(arr, "^a?b", ignore_case=False)
456 expected = pa.array([False, False, False, False, None])
457 assert expected.equals(result)
458
459
460 def test_trim():
461 # \u3000 is unicode whitespace
462 arr = pa.array([" foo", None, " \u3000foo bar \t"])
463 result = pc.utf8_trim_whitespace(arr)
464 expected = pa.array(["foo", None, "foo bar"])
465 assert expected.equals(result)
466
467 arr = pa.array([" foo", None, " \u3000foo bar \t"])
468 result = pc.ascii_trim_whitespace(arr)
469 expected = pa.array(["foo", None, "\u3000foo bar"])
470 assert expected.equals(result)
471
472 arr = pa.array([" foo", None, " \u3000foo bar \t"])
473 result = pc.utf8_trim(arr, characters=' f\u3000')
474 expected = pa.array(["oo", None, "oo bar \t"])
475 assert expected.equals(result)
476
477
478 def test_slice_compatibility():
479 arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
480 for start in range(-6, 6):
481 for stop in range(-6, 6):
482 for step in [-3, -2, -1, 1, 2, 3]:
483 expected = pa.array([k.as_py()[start:stop:step]
484 for k in arr])
485 result = pc.utf8_slice_codeunits(
486 arr, start=start, stop=stop, step=step)
487 assert expected.equals(result)
488
489
490 def test_split_pattern():
491 arr = pa.array(["-foo---bar--", "---foo---b"])
492 result = pc.split_pattern(arr, pattern="---")
493 expected = pa.array([["-foo", "bar--"], ["", "foo", "b"]])
494 assert expected.equals(result)
495
496 result = pc.split_pattern(arr, pattern="---", max_splits=1)
497 expected = pa.array([["-foo", "bar--"], ["", "foo---b"]])
498 assert expected.equals(result)
499
500 result = pc.split_pattern(arr, pattern="---", max_splits=1, reverse=True)
501 expected = pa.array([["-foo", "bar--"], ["---foo", "b"]])
502 assert expected.equals(result)
503
504
505 def test_split_whitespace_utf8():
506 arr = pa.array(["foo bar", " foo \u3000\tb"])
507 result = pc.utf8_split_whitespace(arr)
508 expected = pa.array([["foo", "bar"], ["", "foo", "b"]])
509 assert expected.equals(result)
510
511 result = pc.utf8_split_whitespace(arr, max_splits=1)
512 expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
513 assert expected.equals(result)
514
515 result = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True)
516 expected = pa.array([["foo", "bar"], [" foo", "b"]])
517 assert expected.equals(result)
518
519
520 def test_split_whitespace_ascii():
521 arr = pa.array(["foo bar", " foo \u3000\tb"])
522 result = pc.ascii_split_whitespace(arr)
523 expected = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]])
524 assert expected.equals(result)
525
526 result = pc.ascii_split_whitespace(arr, max_splits=1)
527 expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
528 assert expected.equals(result)
529
530 result = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True)
531 expected = pa.array([["foo", "bar"], [" foo \u3000", "b"]])
532 assert expected.equals(result)
533
534
535 def test_split_pattern_regex():
536 arr = pa.array(["-foo---bar--", "---foo---b"])
537 result = pc.split_pattern_regex(arr, pattern="-+")
538 expected = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]])
539 assert expected.equals(result)
540
541 result = pc.split_pattern_regex(arr, pattern="-+", max_splits=1)
542 expected = pa.array([["", "foo---bar--"], ["", "foo---b"]])
543 assert expected.equals(result)
544
545 with pytest.raises(NotImplementedError,
546 match="Cannot split in reverse with regex"):
547 result = pc.split_pattern_regex(
548 arr, pattern="---", max_splits=1, reverse=True)
549
550
551 def test_min_max():
552 # An example generated function wrapper with possible options
553 data = [4, 5, 6, None, 1]
554 s = pc.min_max(data)
555 assert s.as_py() == {'min': 1, 'max': 6}
556 s = pc.min_max(data, options=pc.ScalarAggregateOptions())
557 assert s.as_py() == {'min': 1, 'max': 6}
558 s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True))
559 assert s.as_py() == {'min': 1, 'max': 6}
560 s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False))
561 assert s.as_py() == {'min': None, 'max': None}
562
563 # Options as dict of kwargs
564 s = pc.min_max(data, options={'skip_nulls': False})
565 assert s.as_py() == {'min': None, 'max': None}
566 # Options as named functions arguments
567 s = pc.min_max(data, skip_nulls=False)
568 assert s.as_py() == {'min': None, 'max': None}
569
570 # Both options and named arguments
571 with pytest.raises(TypeError):
572 s = pc.min_max(
573 data, options=pc.ScalarAggregateOptions(), skip_nulls=False)
574
575 # Wrong options type
576 options = pc.TakeOptions()
577 with pytest.raises(TypeError):
578 s = pc.min_max(data, options=options)
579
580 # Missing argument
581 with pytest.raises(ValueError,
582 match="Function min_max accepts 1 argument"):
583 s = pc.min_max()
584
585
586 def test_any():
587 # ARROW-1846
588
589 options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)
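# With skip_nulls=False nulls propagate Kleene-style, and min_count=0 makes an
# empty input return the identity value (False for "any") instead of null.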
590
591 a = pa.array([], type='bool')
592 assert pc.any(a).as_py() is None
593 assert pc.any(a, min_count=0).as_py() is False
594 assert pc.any(a, options=options).as_py() is False
595
596 a = pa.array([False, None, True])
597 assert pc.any(a).as_py() is True
598 assert pc.any(a, options=options).as_py() is True
599
600 a = pa.array([False, None, False])
601 assert pc.any(a).as_py() is False
602 assert pc.any(a, options=options).as_py() is None
603
604
605 def test_all():
606 # ARROW-10301
607
608 options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)
609
610 a = pa.array([], type='bool')
611 assert pc.all(a).as_py() is None
612 assert pc.all(a, min_count=0).as_py() is True
613 assert pc.all(a, options=options).as_py() is True
614
615 a = pa.array([False, True])
616 assert pc.all(a).as_py() is False
617 assert pc.all(a, options=options).as_py() is False
618
619 a = pa.array([True, None])
620 assert pc.all(a).as_py() is True
621 assert pc.all(a, options=options).as_py() is None
622
623 a = pa.chunked_array([[True], [True, None]])
624 assert pc.all(a).as_py() is True
625 assert pc.all(a, options=options).as_py() is None
626
627 a = pa.chunked_array([[True], [False]])
628 assert pc.all(a).as_py() is False
629 assert pc.all(a, options=options).as_py() is False
630
631
632 def test_is_valid():
633 # An example generated function wrapper without options
634 data = [4, 5, None]
635 assert pc.is_valid(data).to_pylist() == [True, True, False]
636
637 with pytest.raises(TypeError):
638 pc.is_valid(data, options=None)
639
640
641 def test_generated_docstrings():
642 assert pc.min_max.__doc__ == textwrap.dedent("""\
643 Compute the minimum and maximum values of a numeric array.
644
645 Null values are ignored by default.
646 This can be changed through ScalarAggregateOptions.
647
648 Parameters
649 ----------
650 array : Array-like
651 Argument to compute function
652 memory_pool : pyarrow.MemoryPool, optional
653 If not passed, will allocate memory from the default memory pool.
654 options : pyarrow.compute.ScalarAggregateOptions, optional
655 Parameters altering compute function semantics.
656 skip_nulls : optional
657 Parameter for ScalarAggregateOptions constructor. Either `options`
658 or `skip_nulls` can be passed, but not both at the same time.
659 min_count : optional
660 Parameter for ScalarAggregateOptions constructor. Either `options`
661 or `min_count` can be passed, but not both at the same time.
662 """)
663 assert pc.add.__doc__ == textwrap.dedent("""\
664 Add the arguments element-wise.
665
666 Results will wrap around on integer overflow.
667 Use function "add_checked" if you want overflow
668 to return an error.
669
670 Parameters
671 ----------
672 x : Array-like or scalar-like
673 Argument to compute function
674 y : Array-like or scalar-like
675 Argument to compute function
676 memory_pool : pyarrow.MemoryPool, optional
677 If not passed, will allocate memory from the default memory pool.
678 """)
679
680
681 def test_generated_signatures():
682 # The self-documentation provided by signatures should show acceptable
683 # options and their default values.
684 sig = inspect.signature(pc.add)
685 assert str(sig) == "(x, y, *, memory_pool=None)"
686 sig = inspect.signature(pc.min_max)
687 assert str(sig) == ("(array, *, memory_pool=None, "
688 "options=None, skip_nulls=True, min_count=1)")
689 sig = inspect.signature(pc.quantile)
690 assert str(sig) == ("(array, *, memory_pool=None, "
691 "options=None, q=0.5, interpolation='linear', "
692 "skip_nulls=True, min_count=0)")
693 sig = inspect.signature(pc.binary_join_element_wise)
694 assert str(sig) == ("(*strings, memory_pool=None, options=None, "
695 "null_handling='emit_null', null_replacement='')")
696
697
698 # We use isprintable to find codepoints that Python doesn't know about, but
699 # utf8proc does (or, in a future version of Python, the other way around).
700 # These codepoints cannot be compared between Arrow and the Python
701 # implementation.
702 @lru_cache()
703 def find_new_unicode_codepoints():
704 new = set()
705 characters = [chr(c) for c in range(0x80, 0x11000)
706 if not (0xD800 <= c < 0xE000)]
707 is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist()
708 for i, c in enumerate(characters):
709 if is_printable[i] != c.isprintable():
710 new.add(ord(c))
711 return new
712
713
714 # Python claims these are not alpha (not sure why); they are in
715 # gc='Other Letter': https://graphemica.com/%E1%B3%B2
716 unknown_issue_is_alpha = {0x1cf2, 0x1cf3}
717 # utf8proc does not know if codepoints are lower case
718 utf8proc_issue_is_lower = {
719 0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
720 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
721 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
722 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33,
723 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39,
724 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f,
725 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45,
726 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b,
727 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51,
728 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57,
729 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d,
730 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63,
731 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69,
732 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e,
733 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4,
734 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa,
735 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0,
736 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6,
737 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc,
738 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090,
739 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096,
740 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c,
741 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8,
742 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, }
743 # utf8proc does not store if a codepoint is numeric
744 numeric_info_missing = {
745 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
746 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
747 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
748 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
749 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
750 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
751 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
752 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
753 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
754 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
755 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
756 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
757 0x10fcb, }
758 # utf8proc has no digit/numeric information
759 digit_info_missing = {
760 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
761 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070,
762 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080,
763 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
764 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464,
765 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476,
766 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488,
767 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f,
768 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9,
769 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777,
770 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e,
771 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
772 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
773 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
774 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
775 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
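# Note: the following assignment replaces the numeric_info_missing set defined
# above, dropping the 0x10fc5-0x10fcb entries.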
776 numeric_info_missing = {
777 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
778 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
779 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
780 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
781 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
782 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
783 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
784 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
785 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
786 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
787 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, }
788
789 codepoints_ignore = {
790 'is_alnum': numeric_info_missing | digit_info_missing |
791 unknown_issue_is_alpha,
792 'is_alpha': unknown_issue_is_alpha,
793 'is_digit': digit_info_missing,
794 'is_numeric': numeric_info_missing,
795 'is_lower': utf8proc_issue_is_lower
796 }
797
798
799 @pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
800 'is_ascii', 'is_decimal',
801 'is_digit', 'is_lower',
802 'is_numeric', 'is_printable',
803 'is_space', 'is_upper', ])
804 @pytest.mark.parametrize('variant', ['ascii', 'utf8'])
805 def test_string_py_compat_boolean(function_name, variant):
806 arrow_name = variant + "_" + function_name
807 py_name = function_name.replace('_', '')
808 ignore = codepoints_ignore.get(function_name, set()) | \
809 find_new_unicode_codepoints()
810     for i in range(128 if variant == 'ascii' else 0x11000):
811 if i in range(0xD800, 0xE000):
812 continue # bug? pyarrow doesn't allow utf16 surrogates
813 # the issues we know of, we skip
814 if i in ignore:
815 continue
816 # Compare results with the equivalent Python predicate
817 # (except "is_space" where functions are known to be incompatible)
818 c = chr(i)
819 if hasattr(pc, arrow_name) and function_name != 'is_space':
820 ar = pa.array([c])
821 arrow_func = getattr(pc, arrow_name)
822 assert arrow_func(ar)[0].as_py() == getattr(c, py_name)()
823
824
825 def test_pad():
826 arr = pa.array([None, 'a', 'abcd'])
827 assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd']
828 assert pc.ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd']
829 assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd']
830
831 arr = pa.array([None, 'á', 'abcd'])
832 assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd']
833 assert pc.utf8_lpad(arr, width=3).tolist() == [None, ' á', 'abcd']
834 assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á ', 'abcd']
835
836
837 @pytest.mark.pandas
838 def test_replace_slice():
839 offsets = range(-3, 4)
840
841 arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
842 series = arr.to_pandas()
843 for start in offsets:
844 for stop in offsets:
845 expected = series.str.slice_replace(start, stop, 'XX')
846 actual = pc.binary_replace_slice(
847 arr, start=start, stop=stop, replacement='XX')
848 assert actual.tolist() == expected.tolist()
849
850 arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
851 series = arr.to_pandas()
852 for start in offsets:
853 for stop in offsets:
854 expected = series.str.slice_replace(start, stop, 'XX')
855 actual = pc.utf8_replace_slice(
856 arr, start=start, stop=stop, replacement='XX')
857 assert actual.tolist() == expected.tolist()
858
859
860 def test_replace_plain():
861 ar = pa.array(['foo', 'food', None])
862 ar = pc.replace_substring(ar, pattern='foo', replacement='bar')
863 assert ar.tolist() == ['bar', 'bard', None]
864
865
866 def test_replace_regex():
867 ar = pa.array(['foo', 'mood', None])
868 ar = pc.replace_substring_regex(ar, pattern='(.)oo', replacement=r'\100')
869 assert ar.tolist() == ['f00', 'm00d', None]
870
871
872 def test_extract_regex():
873 ar = pa.array(['a1', 'zb2z'])
874 struct = pc.extract_regex(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d)')
875 assert struct.tolist() == [{'letter': 'a', 'digit': '1'}, {
876 'letter': 'b', 'digit': '2'}]
877
878
879 def test_binary_join():
880 ar_list = pa.array([['foo', 'bar'], None, []])
881 expected = pa.array(['foo-bar', None, ''])
882 assert pc.binary_join(ar_list, '-').equals(expected)
883
884 separator_array = pa.array(['1', '2'], type=pa.binary())
885 expected = pa.array(['a1b', 'c2d'], type=pa.binary())
886 ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary()))
887 assert pc.binary_join(ar_list, separator_array).equals(expected)
888
889
890 def test_binary_join_element_wise():
891 null = pa.scalar(None, type=pa.string())
892 arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']]
893 assert pc.binary_join_element_wise(*arrs).to_pylist() == \
894 [None, None, 'b--d']
895 assert pc.binary_join_element_wise('a', 'b', '-').as_py() == 'a-b'
896 assert pc.binary_join_element_wise('a', null, '-').as_py() is None
897 assert pc.binary_join_element_wise('a', 'b', null).as_py() is None
898
899 skip = pc.JoinOptions(null_handling='skip')
900 assert pc.binary_join_element_wise(*arrs, options=skip).to_pylist() == \
901 [None, 'a', 'b--d']
902 assert pc.binary_join_element_wise(
903 'a', 'b', '-', options=skip).as_py() == 'a-b'
904 assert pc.binary_join_element_wise(
905 'a', null, '-', options=skip).as_py() == 'a'
906 assert pc.binary_join_element_wise(
907 'a', 'b', null, options=skip).as_py() is None
908
909 replace = pc.JoinOptions(null_handling='replace', null_replacement='spam')
910 assert pc.binary_join_element_wise(*arrs, options=replace).to_pylist() == \
911 [None, 'a-spam', 'b--d']
912 assert pc.binary_join_element_wise(
913 'a', 'b', '-', options=replace).as_py() == 'a-b'
914 assert pc.binary_join_element_wise(
915 'a', null, '-', options=replace).as_py() == 'a-spam'
916 assert pc.binary_join_element_wise(
917 'a', 'b', null, options=replace).as_py() is None
918
919
920 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
921 def test_take(ty, values):
922 arr = pa.array(values, type=ty)
923 for indices_type in [pa.int8(), pa.int64()]:
924 indices = pa.array([0, 4, 2, None], type=indices_type)
925 result = arr.take(indices)
926 result.validate()
927 expected = pa.array([values[0], values[4], values[2], None], type=ty)
928 assert result.equals(expected)
929
930 # empty indices
931 indices = pa.array([], type=indices_type)
932 result = arr.take(indices)
933 result.validate()
934 expected = pa.array([], type=ty)
935 assert result.equals(expected)
936
937 indices = pa.array([2, 5])
938 with pytest.raises(IndexError):
939 arr.take(indices)
940
941 indices = pa.array([2, -1])
942 with pytest.raises(IndexError):
943 arr.take(indices)
944
945
946 def test_take_indices_types():
947 arr = pa.array(range(5))
948
949 for indices_type in ['uint8', 'int8', 'uint16', 'int16',
950 'uint32', 'int32', 'uint64', 'int64']:
951 indices = pa.array([0, 4, 2, None], type=indices_type)
952 result = arr.take(indices)
953 result.validate()
954 expected = pa.array([0, 4, 2, None])
955 assert result.equals(expected)
956
957 for indices_type in [pa.float32(), pa.float64()]:
958 indices = pa.array([0, 4, 2], type=indices_type)
959 with pytest.raises(NotImplementedError):
960 arr.take(indices)
961
962
963 def test_take_on_chunked_array():
964 # ARROW-9504
965 arr = pa.chunked_array([
966 [
967 "a",
968 "b",
969 "c",
970 "d",
971 "e"
972 ],
973 [
974 "f",
975 "g",
976 "h",
977 "i",
978 "j"
979 ]
980 ])
981
982 indices = np.array([0, 5, 1, 6, 9, 2])
983 result = arr.take(indices)
984 expected = pa.chunked_array([["a", "f", "b", "g", "j", "c"]])
985 assert result.equals(expected)
986
987 indices = pa.chunked_array([[1], [9, 2]])
988 result = arr.take(indices)
989 expected = pa.chunked_array([
990 [
991 "b"
992 ],
993 [
994 "j",
995 "c"
996 ]
997 ])
998 assert result.equals(expected)
999
1000
1001 @pytest.mark.parametrize('ordered', [False, True])
1002 def test_take_dictionary(ordered):
1003 arr = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
1004 ordered=ordered)
1005 result = arr.take(pa.array([0, 1, 3]))
1006 result.validate()
1007 assert result.to_pylist() == ['a', 'b', 'a']
1008 assert result.dictionary.to_pylist() == ['a', 'b', 'c']
1009 assert result.type.ordered is ordered
1010
1011
1012 def test_take_null_type():
1013 # ARROW-10027
1014 arr = pa.array([None] * 10)
1015 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1016 batch = pa.record_batch([arr], names=['a'])
1017 table = pa.table({'a': arr})
1018
1019 indices = pa.array([1, 3, 7, None])
1020 assert len(arr.take(indices)) == 4
1021 assert len(chunked_arr.take(indices)) == 4
1022 assert len(batch.take(indices).column(0)) == 4
1023 assert len(table.take(indices).column(0)) == 4
1024
1025
1026 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
1027 def test_drop_null(ty, values):
1028 arr = pa.array(values, type=ty)
1029 result = arr.drop_null()
1030 result.validate(full=True)
1031 indices = [i for i in range(len(arr)) if arr[i].is_valid]
1032 expected = arr.take(pa.array(indices))
1033 assert result.equals(expected)
1034
1035
1036 def test_drop_null_chunked_array():
1037 arr = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
1038 expected_drop = pa.chunked_array([["a"], ["c", "d"], [], []])
1039
1040 result = arr.drop_null()
1041 assert result.equals(expected_drop)
1042
1043
1044 def test_drop_null_record_batch():
1045 batch = pa.record_batch(
1046 [pa.array(["a", None, "c", "d", None])], names=["a'"])
1047 result = batch.drop_null()
1048 expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
1049 assert result.equals(expected)
1050
1051 batch = pa.record_batch(
1052 [pa.array(["a", None, "c", "d", None]),
1053 pa.array([None, None, "c", None, "e"])], names=["a'", "b'"])
1054
1055 result = batch.drop_null()
1056 expected = pa.record_batch(
1057 [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"])
1058 assert result.equals(expected)
1059
1060
1061 def test_drop_null_table():
1062 table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])
1063 expected = pa.table([pa.array(["a", "c", "d"])], names=["a"])
1064 result = table.drop_null()
1065 assert result.equals(expected)
1066
1067 table = pa.table([pa.chunked_array([["a", None], ["c", "d", None]]),
1068 pa.chunked_array([["a", None], [None, "d", None]]),
1069 pa.chunked_array([["a"], ["b"], [None], ["d", None]])],
1070 names=["a", "b", "c"])
1071 expected = pa.table([pa.array(["a", "d"]),
1072 pa.array(["a", "d"]),
1073 pa.array(["a", "d"])],
1074 names=["a", "b", "c"])
1075 result = table.drop_null()
1076 assert result.equals(expected)
1077
1078 table = pa.table([pa.chunked_array([["a", "b"], ["c", "d", "e"]]),
1079 pa.chunked_array([["A"], ["B"], [None], ["D", None]]),
1080 pa.chunked_array([["a`", None], ["c`", "d`", None]])],
1081 names=["a", "b", "c"])
1082 expected = pa.table([pa.array(["a", "d"]),
1083 pa.array(["A", "D"]),
1084 pa.array(["a`", "d`"])],
1085 names=["a", "b", "c"])
1086 result = table.drop_null()
1087 assert result.equals(expected)
1088
1089
1090 def test_drop_null_null_type():
1091 arr = pa.array([None] * 10)
1092 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1093 batch = pa.record_batch([arr], names=['a'])
1094 table = pa.table({'a': arr})
1095
1096 assert len(arr.drop_null()) == 0
1097 assert len(chunked_arr.drop_null()) == 0
1098 assert len(batch.drop_null().column(0)) == 0
1099 assert len(table.drop_null().column(0)) == 0
1100
1101
1102 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
1103 def test_filter(ty, values):
1104 arr = pa.array(values, type=ty)
1105
1106 mask = pa.array([True, False, False, True, None])
1107 result = arr.filter(mask, null_selection_behavior='drop')
1108 result.validate()
1109 assert result.equals(pa.array([values[0], values[3]], type=ty))
1110 result = arr.filter(mask, null_selection_behavior='emit_null')
1111 result.validate()
1112 assert result.equals(pa.array([values[0], values[3], None], type=ty))
1113
1114 # non-boolean dtype
1115 mask = pa.array([0, 1, 0, 1, 0])
1116 with pytest.raises(NotImplementedError):
1117 arr.filter(mask)
1118
1119 # wrong length
1120 mask = pa.array([True, False, True])
1121 with pytest.raises(ValueError, match="must all be the same length"):
1122 arr.filter(mask)
1123
1124
1125 def test_filter_chunked_array():
1126 arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
1127 expected_drop = pa.chunked_array([["a"], ["e"]])
1128 expected_null = pa.chunked_array([["a"], [None, "e"]])
1129
1130 for mask in [
1131 # mask is array
1132 pa.array([True, False, None, False, True]),
1133 # mask is chunked array
1134 pa.chunked_array([[True, False, None], [False, True]]),
1135 # mask is python object
1136 [True, False, None, False, True]
1137 ]:
1138 result = arr.filter(mask)
1139 assert result.equals(expected_drop)
1140 result = arr.filter(mask, null_selection_behavior="emit_null")
1141 assert result.equals(expected_null)
1142
1143
1144 def test_filter_record_batch():
1145 batch = pa.record_batch(
1146 [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
1147
1148 # mask is array
1149 mask = pa.array([True, False, None, False, True])
1150 result = batch.filter(mask)
1151 expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"])
1152 assert result.equals(expected)
1153
1154 result = batch.filter(mask, null_selection_behavior="emit_null")
1155 expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"])
1156 assert result.equals(expected)
1157
1158
1159 def test_filter_table():
1160 table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
1161 expected_drop = pa.table([pa.array(["a", "e"])], names=["a"])
1162 expected_null = pa.table([pa.array(["a", None, "e"])], names=["a"])
1163
1164 for mask in [
1165 # mask is array
1166 pa.array([True, False, None, False, True]),
1167 # mask is chunked array
1168 pa.chunked_array([[True, False], [None, False, True]]),
1169 # mask is python object
1170 [True, False, None, False, True]
1171 ]:
1172 result = table.filter(mask)
1173 assert result.equals(expected_drop)
1174 result = table.filter(mask, null_selection_behavior="emit_null")
1175 assert result.equals(expected_null)
1176
1177
1178 def test_filter_errors():
1179 arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
1180 batch = pa.record_batch(
1181 [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
1182 table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
1183
1184 for obj in [arr, batch, table]:
1185 # non-boolean dtype
1186 mask = pa.array([0, 1, 0, 1, 0])
1187 with pytest.raises(NotImplementedError):
1188 obj.filter(mask)
1189
1190 # wrong length
1191 mask = pa.array([True, False, True])
1192 with pytest.raises(pa.ArrowInvalid,
1193 match="must all be the same length"):
1194 obj.filter(mask)
1195
1196
1197 def test_filter_null_type():
1198 # ARROW-10027
1199 arr = pa.array([None] * 10)
1200 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1201 batch = pa.record_batch([arr], names=['a'])
1202 table = pa.table({'a': arr})
1203
1204 mask = pa.array([True, False] * 5)
1205 assert len(arr.filter(mask)) == 5
1206 assert len(chunked_arr.filter(mask)) == 5
1207 assert len(batch.filter(mask).column(0)) == 5
1208 assert len(table.filter(mask).column(0)) == 5
1209
1210
1211 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1212 def test_compare_array(typ):
1213 if typ == "array":
1214 def con(values):
1215 return pa.array(values)
1216 else:
1217 def con(values):
1218 return pa.chunked_array([values])
1219
1220 arr1 = con([1, 2, 3, 4, None])
1221 arr2 = con([1, 1, 4, None, 4])
1222
1223 result = pc.equal(arr1, arr2)
1224 assert result.equals(con([True, False, False, None, None]))
1225
1226 result = pc.not_equal(arr1, arr2)
1227 assert result.equals(con([False, True, True, None, None]))
1228
1229 result = pc.less(arr1, arr2)
1230 assert result.equals(con([False, False, True, None, None]))
1231
1232 result = pc.less_equal(arr1, arr2)
1233 assert result.equals(con([True, False, True, None, None]))
1234
1235 result = pc.greater(arr1, arr2)
1236 assert result.equals(con([False, True, False, None, None]))
1237
1238 result = pc.greater_equal(arr1, arr2)
1239 assert result.equals(con([True, True, False, None, None]))
1240
1241
1242 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1243 def test_compare_string_scalar(typ):
1244 if typ == "array":
1245 def con(values):
1246 return pa.array(values)
1247 else:
1248 def con(values):
1249 return pa.chunked_array([values])
1250
1251 arr = con(['a', 'b', 'c', None])
1252 scalar = pa.scalar('b')
1253
1254 result = pc.equal(arr, scalar)
1255 assert result.equals(con([False, True, False, None]))
1256
1257 if typ == "array":
1258 nascalar = pa.scalar(None, type="string")
1259 result = pc.equal(arr, nascalar)
1260 isnull = pc.is_null(result)
1261 assert isnull.equals(con([True, True, True, True]))
1262
1263 result = pc.not_equal(arr, scalar)
1264 assert result.equals(con([True, False, True, None]))
1265
1266 result = pc.less(arr, scalar)
1267 assert result.equals(con([True, False, False, None]))
1268
1269 result = pc.less_equal(arr, scalar)
1270 assert result.equals(con([True, True, False, None]))
1271
1272 result = pc.greater(arr, scalar)
1273 assert result.equals(con([False, False, True, None]))
1274
1275 result = pc.greater_equal(arr, scalar)
1276 assert result.equals(con([False, True, True, None]))
1277
1278
1279 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1280 def test_compare_scalar(typ):
1281 if typ == "array":
1282 def con(values):
1283 return pa.array(values)
1284 else:
1285 def con(values):
1286 return pa.chunked_array([values])
1287
1288 arr = con([1, 2, 3, None])
1289 scalar = pa.scalar(2)
1290
1291 result = pc.equal(arr, scalar)
1292 assert result.equals(con([False, True, False, None]))
1293
1294 if typ == "array":
1295 nascalar = pa.scalar(None, type="int64")
1296 result = pc.equal(arr, nascalar)
1297 assert result.to_pylist() == [None, None, None, None]
1298
1299 result = pc.not_equal(arr, scalar)
1300 assert result.equals(con([True, False, True, None]))
1301
1302 result = pc.less(arr, scalar)
1303 assert result.equals(con([True, False, False, None]))
1304
1305 result = pc.less_equal(arr, scalar)
1306 assert result.equals(con([True, True, False, None]))
1307
1308 result = pc.greater(arr, scalar)
1309 assert result.equals(con([False, False, True, None]))
1310
1311 result = pc.greater_equal(arr, scalar)
1312 assert result.equals(con([False, True, True, None]))
1313
1314
1315 def test_compare_chunked_array_mixed():
1316 arr = pa.array([1, 2, 3, 4, None])
1317 arr_chunked = pa.chunked_array([[1, 2, 3], [4, None]])
1318 arr_chunked2 = pa.chunked_array([[1, 2], [3, 4, None]])
1319
1320 expected = pa.chunked_array([[True, True, True, True, None]])
1321
1322 for left, right in [
1323 (arr, arr_chunked),
1324 (arr_chunked, arr),
1325 (arr_chunked, arr_chunked2),
1326 ]:
1327 result = pc.equal(left, right)
1328 assert result.equals(expected)
1329
1330
1331 def test_arithmetic_add():
1332 left = pa.array([1, 2, 3, 4, 5])
1333 right = pa.array([0, -1, 1, 2, 3])
1334 result = pc.add(left, right)
1335 expected = pa.array([1, 1, 4, 6, 8])
1336 assert result.equals(expected)
1337
1338
1339 def test_arithmetic_subtract():
1340 left = pa.array([1, 2, 3, 4, 5])
1341 right = pa.array([0, -1, 1, 2, 3])
1342 result = pc.subtract(left, right)
1343 expected = pa.array([1, 3, 2, 2, 2])
1344 assert result.equals(expected)
1345
1346
1347 def test_arithmetic_multiply():
1348 left = pa.array([1, 2, 3, 4, 5])
1349 right = pa.array([0, -1, 1, 2, 3])
1350 result = pc.multiply(left, right)
1351 expected = pa.array([0, -2, 3, 8, 15])
1352 assert result.equals(expected)
1353
1354
1355 @pytest.mark.parametrize("ty", ["round", "round_to_multiple"])
1356 def test_round_to_integer(ty):
1357 if ty == "round":
1358 round = pc.round
1359 RoundOptions = partial(pc.RoundOptions, ndigits=0)
1360 elif ty == "round_to_multiple":
1361 round = pc.round_to_multiple
1362 RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1)
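# Rounding to 0 digits and rounding to a multiple of 1 are equivalent, so both
# kernels share the same expected results per rounding mode.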
1363
1364 values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None]
1365 rmode_and_expected = {
1366 "down": [3, 3, 3, 4, -4, -4, -4, None],
1367 "up": [4, 4, 4, 5, -3, -3, -3, None],
1368 "towards_zero": [3, 3, 3, 4, -3, -3, -3, None],
1369 "towards_infinity": [4, 4, 4, 5, -4, -4, -4, None],
1370 "half_down": [3, 3, 4, 4, -3, -4, -4, None],
1371 "half_up": [3, 4, 4, 5, -3, -3, -4, None],
1372 "half_towards_zero": [3, 3, 4, 4, -3, -3, -4, None],
1373 "half_towards_infinity": [3, 4, 4, 5, -3, -4, -4, None],
1374 "half_to_even": [3, 4, 4, 4, -3, -4, -4, None],
1375 "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None],
1376 }
1377 for round_mode, expected in rmode_and_expected.items():
1378 options = RoundOptions(round_mode=round_mode)
1379 result = round(values, options=options)
1380 np.testing.assert_array_equal(result, pa.array(expected))
1381
1382
1383 def test_round():
1384 values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
1385 ndigits_and_expected = {
1386 -2: [300, 0, 0, 0, -0, -0, -0, None],
1387 -1: [320, 0, 0, 0, -0, -40, -0, None],
1388 0: [320, 4, 3, 5, -3, -35, -3, None],
1389 1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
1390 2: [320, 3.5, 3.08, 4.5, -3.21, -35.12, -3.05, None],
1391 }
1392 for ndigits, expected in ndigits_and_expected.items():
1393 options = pc.RoundOptions(ndigits, "half_towards_infinity")
1394 result = pc.round(values, options=options)
1395 np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
1396
1397
1398 def test_round_to_multiple():
1399 values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
1400 multiple_and_expected = {
1401 2: [320, 4, 4, 4, -4, -36, -4, None],
1402 0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None],
1403 0.1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
1404 10: [320, 0, 0, 0, -0, -40, -0, None],
1405 100: [300, 0, 0, 0, -0, -0, -0, None],
1406 }
1407 for multiple, expected in multiple_and_expected.items():
1408 options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity")
1409 result = pc.round_to_multiple(values, options=options)
1410 np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
1411
1412 with pytest.raises(pa.ArrowInvalid, match="multiple must be positive"):
1413 pc.round_to_multiple(values, multiple=-2)
1414
1415
1416 def test_is_null():
1417 arr = pa.array([1, 2, 3, None])
1418 result = arr.is_null()
1419 expected = pa.array([False, False, False, True])
1420 assert result.equals(expected)
1421 assert result.equals(pc.is_null(arr))
1422 result = arr.is_valid()
1423 expected = pa.array([True, True, True, False])
1424 assert result.equals(expected)
1425 assert result.equals(pc.is_valid(arr))
1426
1427 arr = pa.chunked_array([[1, 2], [3, None]])
1428 result = arr.is_null()
1429 expected = pa.chunked_array([[False, False], [False, True]])
1430 assert result.equals(expected)
1431 result = arr.is_valid()
1432 expected = pa.chunked_array([[True, True], [True, False]])
1433 assert result.equals(expected)
1434
1435 arr = pa.array([1, 2, 3, None, np.nan])
1436 result = arr.is_null()
1437 expected = pa.array([False, False, False, True, False])
1438 assert result.equals(expected)
1439
1440 result = arr.is_null(nan_is_null=True)
1441 expected = pa.array([False, False, False, True, True])
1442 assert result.equals(expected)
1443
1444
1445 def test_fill_null():
1446 arr = pa.array([1, 2, None, 4], type=pa.int8())
1447 fill_value = pa.array([5], type=pa.int8())
1448 with pytest.raises(pa.ArrowInvalid,
1449 match="Array arguments must all be the same length"):
1450 arr.fill_null(fill_value)
1451
1452 arr = pa.array([None, None, None, None], type=pa.null())
1453 fill_value = pa.scalar(None, type=pa.null())
1454 result = arr.fill_null(fill_value)
1455 expected = pa.array([None, None, None, None])
1456 assert result.equals(expected)
1457
1458 arr = pa.array(['a', 'bb', None])
1459 result = arr.fill_null('ccc')
1460 expected = pa.array(['a', 'bb', 'ccc'])
1461 assert result.equals(expected)
1462
1463 arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
1464 result = arr.fill_null('ccc')
1465 expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
1466 assert result.equals(expected)
1467
1468 arr = pa.array(['a', 'bb', None])
1469 result = arr.fill_null(None)
1470 expected = pa.array(['a', 'bb', None])
1471 assert result.equals(expected)
1472
1473
1474 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
1475 def test_fill_null_array(arrow_type):
1476 arr = pa.array([1, 2, None, 4], type=arrow_type)
1477 fill_value = pa.scalar(5, type=arrow_type)
1478 result = arr.fill_null(fill_value)
1479 expected = pa.array([1, 2, 5, 4], type=arrow_type)
1480 assert result.equals(expected)
1481
1482 # Implicit conversions
1483 result = arr.fill_null(5)
1484 assert result.equals(expected)
1485
1486 # ARROW-9451: Unsigned integers allow this for some reason
1487 if not pa.types.is_unsigned_integer(arr.type):
1488 with pytest.raises((ValueError, TypeError)):
1489 arr.fill_null('5')
1490
1491 result = arr.fill_null(pa.scalar(5, type='int8'))
1492 assert result.equals(expected)
1493
1494
1495 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
1496 def test_fill_null_chunked_array(arrow_type):
1497 fill_value = pa.scalar(5, type=arrow_type)
1498 arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
1499 result = arr.fill_null(fill_value)
1500 expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)])
1501 assert result.equals(expected)
1502
1503 arr = pa.chunked_array([
1504 pa.array([1, 2], type=arrow_type),
1505 pa.array([], type=arrow_type),
1506 pa.array([None, 4], type=arrow_type)
1507 ])
1508 expected = pa.chunked_array([
1509 pa.array([1, 2], type=arrow_type),
1510 pa.array([], type=arrow_type),
1511 pa.array([5, 4], type=arrow_type)
1512 ])
1513 result = arr.fill_null(fill_value)
1514 assert result.equals(expected)
1515
1516 # Implicit conversions
1517 result = arr.fill_null(5)
1518 assert result.equals(expected)
1519
1520 result = arr.fill_null(pa.scalar(5, type='int8'))
1521 assert result.equals(expected)
1522
1523
1524 def test_logical():
1525 a = pa.array([True, False, False, None])
1526 b = pa.array([True, True, False, True])
1527
1528 assert pc.and_(a, b) == pa.array([True, False, False, None])
1529 assert pc.and_kleene(a, b) == pa.array([True, False, False, None])
1530
1531 assert pc.or_(a, b) == pa.array([True, True, False, None])
1532 assert pc.or_kleene(a, b) == pa.array([True, True, False, True])
1533
1534 assert pc.xor(a, b) == pa.array([False, True, False, None])
1535
1536 assert pc.invert(a) == pa.array([False, True, True, None])
1537
1538
1539 def test_cast():
1540 arr = pa.array([2 ** 63 - 1], type='int64')
1541
1542 with pytest.raises(pa.ArrowInvalid):
1543 pc.cast(arr, 'int32')
1544
1545 assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32')
1546
1547 arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
1548 expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
1549 assert pc.cast(arr, 'timestamp[ms]') == expected
1550
1551 arr = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8()))
1552 expected = pa.array([["1", "2"], ["3", "4", "5"]],
1553 type=pa.list_(pa.utf8()))
1554 assert pc.cast(arr, expected.type) == expected
1555
1556
1557 def test_strptime():
1558 arr = pa.array(["5/1/2020", None, "12/13/1900"])
1559
1560 got = pc.strptime(arr, format='%m/%d/%Y', unit='s')
1561 expected = pa.array([datetime(2020, 5, 1), None, datetime(1900, 12, 13)],
1562 type=pa.timestamp('s'))
1563 assert got == expected
1564
1565
1566 # TODO: We should test on Windows once ARROW-13168 is resolved.
1567 @pytest.mark.pandas
1568 @pytest.mark.skipif(sys.platform == 'win32',
1569 reason="Timezone database is not available on Windows yet")
1570 def test_strftime():
1571 from pyarrow.vendored.version import Version
1572
1573 def _fix_timestamp(s):
1574 if Version(pd.__version__) < Version("1.0.0"):
1575 return s.to_series().replace("NaT", pd.NaT)
1576 else:
1577 return s
1578
1579 times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
1580 timezones = ["CET", "UTC", "Europe/Ljubljana"]
1581
1582 formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
1583 "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
1584 "%X", "%%", "%G", "%V", "%u"]
1585
1586 for timezone in timezones:
1587 ts = pd.to_datetime(times).tz_localize(timezone)
1588 for unit in ["s", "ms", "us", "ns"]:
1589 tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
1590 for fmt in formats:
1591 options = pc.StrftimeOptions(fmt)
1592 result = pc.strftime(tsa, options=options)
1593 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1594 assert result.equals(expected)
1595
1596 fmt = "%Y-%m-%dT%H:%M:%S"
1597
1598 # Default format
1599 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1600 result = pc.strftime(tsa, options=pc.StrftimeOptions())
1601 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1602 assert result.equals(expected)
1603
1604 # Default format plus timezone
1605 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1606 result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
1607 expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
1608 assert result.equals(expected)
1609
1610 # Pandas %S is equivalent to %S in arrow for unit="s"
1611 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1612 options = pc.StrftimeOptions("%S")
1613 result = pc.strftime(tsa, options=options)
1614 expected = pa.array(_fix_timestamp(ts.strftime("%S")))
1615 assert result.equals(expected)
1616
1617 # Pandas %S.%f is equivalent to %S in arrow for unit="us"
1618 tsa = pa.array(ts, type=pa.timestamp("us", timezone))
1619 options = pc.StrftimeOptions("%S")
1620 result = pc.strftime(tsa, options=options)
1621 expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
1622 assert result.equals(expected)
1623
1624 # Test setting locale
1625 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1626 options = pc.StrftimeOptions(fmt, locale="C")
1627 result = pc.strftime(tsa, options=options)
1628 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1629 assert result.equals(expected)
1630
1631 # Test timestamps without timezone
1632 fmt = "%Y-%m-%dT%H:%M:%S"
1633 ts = pd.to_datetime(times)
1634 tsa = pa.array(ts, type=pa.timestamp("s"))
1635 result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
1636 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1637
1638 assert result.equals(expected)
1639 with pytest.raises(pa.ArrowInvalid,
1640 match="Timezone not present, cannot convert to string"):
1641 pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
1642 with pytest.raises(pa.ArrowInvalid,
1643 match="Timezone not present, cannot convert to string"):
1644 pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z"))
1645
1646
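# A minimal sketch, not collected by pytest: based on the assertions above,
# the default StrftimeOptions format appears to be "%Y-%m-%dT%H:%M:%S", and
# timezone-naive input works as long as the format contains no %z/%Z; both
# points are inferred from the test above rather than stated guarantees.
def _example_strftime_naive_default():
    tsa = pa.array([datetime(2020, 5, 1, 9, 30)], type=pa.timestamp('s'))
    assert pc.strftime(tsa) == pa.array(["2020-05-01T09:30:00"])

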
1647 def _check_datetime_components(timestamps, timezone=None):
1648 from pyarrow.vendored.version import Version
1649
1650 ts = pd.to_datetime(timestamps).tz_localize(
1651 "UTC").tz_convert(timezone).to_series()
1652 tsa = pa.array(ts, pa.timestamp("ns", tz=timezone))
1653
1654 subseconds = ((ts.dt.microsecond * 10 ** 3 +
1655 ts.dt.nanosecond) * 10 ** -9).round(9)
1656 iso_calendar_fields = [
1657 pa.field('iso_year', pa.int64()),
1658 pa.field('iso_week', pa.int64()),
1659 pa.field('iso_day_of_week', pa.int64())
1660 ]
1661
1662 if Version(pd.__version__) < Version("1.1.0"):
1663 # https://github.com/pandas-dev/pandas/issues/33206
1664 iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64")
1665 iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64")
1666 iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64")
1667 else:
1668 # Casting is required because pandas isocalendar returns int32
1669 # while arrow isocalendar returns int64.
1670 iso_year = ts.dt.isocalendar()["year"].astype("int64")
1671 iso_week = ts.dt.isocalendar()["week"].astype("int64")
1672 iso_day = ts.dt.isocalendar()["day"].astype("int64")
1673
1674 iso_calendar = pa.StructArray.from_arrays(
1675 [iso_year, iso_week, iso_day],
1676 fields=iso_calendar_fields)
1677
1678 assert pc.year(tsa).equals(pa.array(ts.dt.year))
1679 assert pc.month(tsa).equals(pa.array(ts.dt.month))
1680 assert pc.day(tsa).equals(pa.array(ts.dt.day))
1681 assert pc.day_of_week(tsa).equals(pa.array(ts.dt.dayofweek))
1682 assert pc.day_of_year(tsa).equals(pa.array(ts.dt.dayofyear))
1683 assert pc.iso_year(tsa).equals(pa.array(iso_year))
1684 assert pc.iso_week(tsa).equals(pa.array(iso_week))
1685 assert pc.iso_calendar(tsa).equals(iso_calendar)
1686 assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter))
1687 assert pc.hour(tsa).equals(pa.array(ts.dt.hour))
1688 assert pc.minute(tsa).equals(pa.array(ts.dt.minute))
1689 assert pc.second(tsa).equals(pa.array(ts.dt.second.values))
1690 assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10 ** 3))
1691 assert pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10 ** 3))
1692 assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond))
1693 assert pc.subsecond(tsa).equals(pa.array(subseconds))
1694
1695 day_of_week_options = pc.DayOfWeekOptions(
1696 count_from_zero=False, week_start=1)
1697 assert pc.day_of_week(tsa, options=day_of_week_options).equals(
1698 pa.array(ts.dt.dayofweek + 1))
1699
1700 week_options = pc.WeekOptions(
1701 week_starts_monday=True, count_from_zero=False,
1702 first_week_is_fully_in_year=False)
1703 assert pc.week(tsa, options=week_options).equals(pa.array(iso_week))
1704
1705
1706 @pytest.mark.pandas
1707 def test_extract_datetime_components():
1708 from pyarrow.vendored.version import Version
1709
1710 timestamps = ["1970-01-01T00:00:59.123456789",
1711 "2000-02-29T23:23:23.999999999",
1712 "2033-05-18T03:33:20.000000000",
1713 "2020-01-01T01:05:05.001",
1714 "2019-12-31T02:10:10.002",
1715 "2019-12-30T03:15:15.003",
1716 "2009-12-31T04:20:20.004132",
1717 "2010-01-01T05:25:25.005321",
1718 "2010-01-03T06:30:30.006163",
1719 "2010-01-04T07:35:35",
1720 "2006-01-01T08:40:40",
1721 "2005-12-31T09:45:45",
1722 "2008-12-28",
1723 "2008-12-29",
1724 "2012-01-01 01:02:03"]
1725 timezones = ["UTC", "US/Central", "Asia/Kolkata",
1726 "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"]
1727
1728 # Test timezone naive timestamp array
1729 _check_datetime_components(timestamps)
1730
1731 # Test timezone aware timestamp array
1732 if sys.platform == 'win32':
1733 # TODO: We should test on windows once ARROW-13168 is resolved.
1734 pytest.skip('Timezone database is not available on Windows yet')
1735 elif Version(pd.__version__) < Version('1.0.0'):
1736 pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
1737 else:
1738 for timezone in timezones:
1739 _check_datetime_components(timestamps, timezone)
1740
1741
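# A minimal sketch, not collected by pytest: the extraction kernels also
# work on plain Python datetimes, without the pandas round-trip used by
# _check_datetime_components above.
def _example_extract_components_without_pandas():
    tsa = pa.array([datetime(2020, 2, 29, 13, 45)], type=pa.timestamp('s'))
    assert pc.year(tsa) == pa.array([2020])
    assert pc.month(tsa) == pa.array([2])
    assert pc.day(tsa) == pa.array([29])
    assert pc.hour(tsa) == pa.array([13])

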
1742 # TODO: We should test on windows once ARROW-13168 is resolved.
1743 @pytest.mark.pandas
1744 @pytest.mark.skipif(sys.platform == 'win32',
1745 reason="Timezone database is not available on Windows yet")
1746 def test_assume_timezone():
1747 from pyarrow.vendored.version import Version
1748
1749 ts_type = pa.timestamp("ns")
1750 timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
1751 "2000-02-29T23:23:23.999999999",
1752 "2033-05-18T03:33:20.000000000",
1753 "2020-01-01T01:05:05.001",
1754 "2019-12-31T02:10:10.002",
1755 "2019-12-30T03:15:15.003",
1756 "2009-12-31T04:20:20.004132",
1757 "2010-01-01T05:25:25.005321",
1758 "2010-01-03T06:30:30.006163",
1759 "2010-01-04T07:35:35",
1760 "2006-01-01T08:40:40",
1761 "2005-12-31T09:45:45",
1762 "2008-12-28",
1763 "2008-12-29",
1764 "2012-01-01 01:02:03"])
1765 nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
1766 "2015-03-29 03:30:00"])
1767 ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
1768 "2018-10-28 02:36:00",
1769 "2018-10-28 03:46:00"])
1770 ambiguous_array = pa.array(ambiguous, type=ts_type)
1771 nonexistent_array = pa.array(nonexistent, type=ts_type)
1772
1773 for timezone in ["UTC", "US/Central", "Asia/Kolkata"]:
1774 options = pc.AssumeTimezoneOptions(timezone)
1775 ta = pa.array(timestamps, type=ts_type)
1776 expected = timestamps.tz_localize(timezone)
1777 result = pc.assume_timezone(ta, options=options)
1778 assert result.equals(pa.array(expected))
1779
1780 ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone))
1781 with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"):
1782 pc.assume_timezone(ta_zoned, options=options)
1783
1784 invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss")
1785 with pytest.raises(ValueError, match="not found in timezone database"):
1786 pc.assume_timezone(ta, options=invalid_options)
1787
1788 timezone = "Europe/Brussels"
1789
1790 # The 'nonexistent' parameter of tz_localize was introduced in pandas 0.24.0
1791 if Version(pd.__version__) >= Version("0.24.0"):
1792 options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
1793 options_nonexistent_earliest = pc.AssumeTimezoneOptions(
1794 timezone, ambiguous="raise", nonexistent="earliest")
1795 options_nonexistent_latest = pc.AssumeTimezoneOptions(
1796 timezone, ambiguous="raise", nonexistent="latest")
1797
1798 with pytest.raises(ValueError,
1799 match="Timestamp doesn't exist in "
1800 f"timezone '{timezone}'"):
1801 pc.assume_timezone(nonexistent_array,
1802 options=options_nonexistent_raise)
1803
1804 expected = pa.array(nonexistent.tz_localize(
1805 timezone, nonexistent="shift_forward"))
1806 result = pc.assume_timezone(
1807 nonexistent_array, options=options_nonexistent_latest)
1808 assert result.equals(expected)
1809
1810 expected = pa.array(nonexistent.tz_localize(
1811 timezone, nonexistent="shift_backward"))
1812 result = pc.assume_timezone(
1813 nonexistent_array, options=options_nonexistent_earliest)
1814 assert result.equals(expected)
1815
1816 options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
1817 options_ambiguous_latest = pc.AssumeTimezoneOptions(
1818 timezone, ambiguous="latest", nonexistent="raise")
1819 options_ambiguous_earliest = pc.AssumeTimezoneOptions(
1820 timezone, ambiguous="earliest", nonexistent="raise")
1821
1822 with pytest.raises(ValueError,
1823 match="Timestamp is ambiguous in "
1824 f"timezone '{timezone}'"):
1825 pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise)
1826
1827 expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True])
1828 result = pc.assume_timezone(
1829 ambiguous_array, options=options_ambiguous_earliest)
1830 assert result.equals(pa.array(expected))
1831
1832 expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False])
1833 result = pc.assume_timezone(
1834 ambiguous_array, options=options_ambiguous_latest)
1835 assert result.equals(pa.array(expected))
1836
1837
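# A minimal sketch, not collected by pytest: assume_timezone is assumed to
# keep the unit of the input and only attach the requested timezone to the
# output type, reinterpreting the naive values as local times in that zone.
def _example_assume_timezone_result_type():
    ta = pa.array([datetime(2020, 1, 1)], type=pa.timestamp("s"))
    result = pc.assume_timezone(ta, options=pc.AssumeTimezoneOptions("UTC"))
    assert result.type == pa.timestamp("s", tz="UTC")

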
1838 def test_count():
1839 arr = pa.array([1, 2, 3, None, None])
1840 assert pc.count(arr).as_py() == 3
1841 assert pc.count(arr, mode='only_valid').as_py() == 3
1842 assert pc.count(arr, mode='only_null').as_py() == 2
1843 assert pc.count(arr, mode='all').as_py() == 5
1844
1845
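# A minimal sketch, not collected by pytest: the 'only_valid' and
# 'only_null' counts always add up to the 'all' count, which is simply the
# length of the input.
def _example_count_modes_add_up():
    arr = pa.array([1, None, 3, None, None])
    valid = pc.count(arr, mode='only_valid').as_py()
    null = pc.count(arr, mode='only_null').as_py()
    assert valid + null == pc.count(arr, mode='all').as_py() == len(arr)

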
1846 def test_index():
1847 arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
1848 assert pc.index(arr, pa.scalar(0)).as_py() == 0
1849 assert pc.index(arr, pa.scalar(2, type=pa.int8())).as_py() == -1
1850 assert pc.index(arr, 4).as_py() == 4
1851 assert arr.index(3, start=2).as_py() == 3
1852 assert arr.index(None).as_py() == -1
1853
1854 arr = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
1855 assert arr.index(1).as_py() == 0
1856 assert arr.index(1, start=2).as_py() == 2
1857 assert arr.index(1, start=1, end=2).as_py() == -1
1858
1859
1860 def check_partition_nth(data, indices, pivot, null_placement):
1861 indices = indices.to_pylist()
1862 assert len(indices) == len(data)
1863 assert sorted(indices) == list(range(len(data)))
1864 until_pivot = [data[indices[i]] for i in range(pivot)]
1865 after_pivot = [data[indices[i]] for i in range(pivot, len(data))]
1866 p = data[indices[pivot]]
1867 if p is None:
1868 if null_placement == "at_start":
1869 assert all(v is None for v in until_pivot)
1870 else:
1871 assert all(v is None for v in after_pivot)
1872 else:
1873 if null_placement == "at_start":
1874 assert all(v is None or v <= p for v in until_pivot)
1875 assert all(v >= p for v in after_pivot)
1876 else:
1877 assert all(v <= p for v in until_pivot)
1878 assert all(v is None or v >= p for v in after_pivot)
1879
1880
1881 def test_partition_nth():
1882 data = list(range(100, 140))
1883 random.shuffle(data)
1884 pivot = 10
1885 indices = pc.partition_nth_indices(data, pivot=pivot)
1886 check_partition_nth(data, indices, pivot, "at_end")
1887
1888
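# A minimal sketch, not collected by pytest: the value that lands at the
# pivot position is the one a full sort would place there (the pivot-th
# smallest), while the two sides are only partially ordered.
def _example_partition_nth_order_statistic():
    data = [5, 1, 4, 2, 3]
    pivot = 2
    indices = pc.partition_nth_indices(data, pivot=pivot).to_pylist()
    assert data[indices[pivot]] == sorted(data)[pivot]

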
1889 def test_partition_nth_null_placement():
1890 data = list(range(10)) + [None] * 10
1891 random.shuffle(data)
1892
1893 for pivot in (0, 7, 13, 19):
1894 for null_placement in ("at_start", "at_end"):
1895 indices = pc.partition_nth_indices(data, pivot=pivot,
1896 null_placement=null_placement)
1897 check_partition_nth(data, indices, pivot, null_placement)
1898
1899
1900 def test_select_k_array():
1901 def validate_select_k(select_k_indices, arr, order, stable_sort=False):
1902 sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)])
1903 head_k_indices = sorted_indices.slice(0, len(select_k_indices))
1904 if stable_sort:
1905 assert select_k_indices == head_k_indices
1906 else:
1907 expected = pc.take(arr, head_k_indices)
1908 actual = pc.take(arr, select_k_indices)
1909 assert actual == expected
1910
1911 arr = pa.array([1, 2, None, 0])
1912 for k in [0, 2, 4]:
1913 for order in ["descending", "ascending"]:
1914 result = pc.select_k_unstable(
1915 arr, k=k, sort_keys=[("dummy", order)])
1916 validate_select_k(result, arr, order)
1917
1918 result = pc.top_k_unstable(arr, k=k)
1919 validate_select_k(result, arr, "descending")
1920
1921 result = pc.bottom_k_unstable(arr, k=k)
1922 validate_select_k(result, arr, "ascending")
1923
1924 result = pc.select_k_unstable(
1925 arr, options=pc.SelectKOptions(
1926 k=2, sort_keys=[("dummy", "descending")])
1927 )
1928 validate_select_k(result, arr, "descending")
1929
1930 result = pc.select_k_unstable(
1931 arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")])
1932 )
1933 validate_select_k(result, arr, "ascending")
1934
1935
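# A minimal sketch, not collected by pytest: select_k/top_k return indices,
# so materializing the selected values takes one extra pc.take call; sorting
# the result avoids relying on the (unstable) output order.
def _example_top_k_then_take():
    arr = pa.array([1, 4, 2, 3])
    top2 = pc.take(arr, pc.top_k_unstable(arr, k=2))
    assert sorted(top2.to_pylist(), reverse=True) == [4, 3]

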
1936 def test_select_k_table():
1937 def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False):
1938 sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys)
1939 head_k_indices = sorted_indices.slice(0, len(select_k_indices))
1940 if stable_sort:
1941 assert select_k_indices == head_k_indices
1942 else:
1943 expected = pc.take(tbl, head_k_indices)
1944 actual = pc.take(tbl, select_k_indices)
1945 assert actual == expected
1946
1947 table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]})
1948 for k in [0, 2, 4]:
1949 result = pc.select_k_unstable(
1950 table, k=k, sort_keys=[("a", "ascending")])
1951 validate_select_k(result, table, sort_keys=[("a", "ascending")])
1952
1953 result = pc.select_k_unstable(
1954 table, k=k, sort_keys=[("a", "ascending"), ("b", "ascending")])
1955 validate_select_k(
1956 result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])
1957
1958 result = pc.top_k_unstable(table, k=k, sort_keys=["a"])
1959 validate_select_k(result, table, sort_keys=[("a", "descending")])
1960
1961 result = pc.bottom_k_unstable(table, k=k, sort_keys=["a", "b"])
1962 validate_select_k(
1963 result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])
1964
1965 with pytest.raises(ValueError,
1966 match="select_k_unstable requires a nonnegative `k`"):
1967 pc.select_k_unstable(table)
1968
1969 with pytest.raises(ValueError,
1970 match="select_k_unstable requires a "
1971 "non-empty `sort_keys`"):
1972 pc.select_k_unstable(table, k=2, sort_keys=[])
1973
1974 with pytest.raises(ValueError, match="not a valid sort order"):
1975 pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")])
1976
1977 with pytest.raises(ValueError, match="Nonexistent sort key column"):
1978 pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")])
1979
1980
1981 def test_array_sort_indices():
1982 arr = pa.array([1, 2, None, 0])
1983 result = pc.array_sort_indices(arr)
1984 assert result.to_pylist() == [3, 0, 1, 2]
1985 result = pc.array_sort_indices(arr, order="ascending")
1986 assert result.to_pylist() == [3, 0, 1, 2]
1987 result = pc.array_sort_indices(arr, order="descending")
1988 assert result.to_pylist() == [1, 0, 3, 2]
1989 result = pc.array_sort_indices(arr, order="descending",
1990 null_placement="at_start")
1991 assert result.to_pylist() == [2, 1, 0, 3]
1992
1993 with pytest.raises(ValueError, match="not a valid sort order"):
1994 pc.array_sort_indices(arr, order="nonscending")
1995
1996
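# A minimal sketch, not collected by pytest: taking with the computed
# indices yields the values in sorted order, with nulls placed according to
# null_placement (at the end by default).
def _example_sort_indices_then_take():
    arr = pa.array([1, 2, None, 0])
    result = pc.take(arr, pc.array_sort_indices(arr))
    assert result.to_pylist() == [0, 1, 2, None]

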
1997 def test_sort_indices_array():
1998 arr = pa.array([1, 2, None, 0])
1999 result = pc.sort_indices(arr)
2000 assert result.to_pylist() == [3, 0, 1, 2]
2001 result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")])
2002 assert result.to_pylist() == [3, 0, 1, 2]
2003 result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")])
2004 assert result.to_pylist() == [1, 0, 3, 2]
2005 result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")],
2006 null_placement="at_start")
2007 assert result.to_pylist() == [2, 1, 0, 3]
2008 # Using SortOptions
2009 result = pc.sort_indices(
2010 arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")])
2011 )
2012 assert result.to_pylist() == [1, 0, 3, 2]
2013 result = pc.sort_indices(
2014 arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")],
2015 null_placement="at_start")
2016 )
2017 assert result.to_pylist() == [2, 1, 0, 3]
2018
2019
2020 def test_sort_indices_table():
2021 table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]})
2022
2023 result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
2024 assert result.to_pylist() == [3, 0, 1, 2]
2025 result = pc.sort_indices(table, sort_keys=[("a", "ascending")],
2026 null_placement="at_start")
2027 assert result.to_pylist() == [2, 3, 0, 1]
2028
2029 result = pc.sort_indices(
2030 table, sort_keys=[("a", "descending"), ("b", "ascending")]
2031 )
2032 assert result.to_pylist() == [1, 0, 3, 2]
2033 result = pc.sort_indices(
2034 table, sort_keys=[("a", "descending"), ("b", "ascending")],
2035 null_placement="at_start"
2036 )
2037 assert result.to_pylist() == [2, 1, 0, 3]
2038
2039 with pytest.raises(ValueError, match="Must specify one or more sort keys"):
2040 pc.sort_indices(table)
2041
2042 with pytest.raises(ValueError, match="Nonexistent sort key column"):
2043 pc.sort_indices(table, sort_keys=[("unknown", "ascending")])
2044
2045 with pytest.raises(ValueError, match="not a valid sort order"):
2046 pc.sort_indices(table, sort_keys=[("a", "nonscending")])
2047
2048
2049 def test_is_in():
2050 arr = pa.array([1, 2, None, 1, 2, 3])
2051
2052 result = pc.is_in(arr, value_set=pa.array([1, 3, None]))
2053 assert result.to_pylist() == [True, False, True, True, False, True]
2054
2055 result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True)
2056 assert result.to_pylist() == [True, False, False, True, False, True]
2057
2058 result = pc.is_in(arr, value_set=pa.array([1, 3]))
2059 assert result.to_pylist() == [True, False, False, True, False, True]
2060
2061 result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
2062 assert result.to_pylist() == [True, False, False, True, False, True]
2063
2064
2065 def test_index_in():
2066 arr = pa.array([1, 2, None, 1, 2, 3])
2067
2068 result = pc.index_in(arr, value_set=pa.array([1, 3, None]))
2069 assert result.to_pylist() == [0, None, 2, 0, None, 1]
2070
2071 result = pc.index_in(arr, value_set=pa.array([1, 3, None]),
2072 skip_nulls=True)
2073 assert result.to_pylist() == [0, None, None, 0, None, 1]
2074
2075 result = pc.index_in(arr, value_set=pa.array([1, 3]))
2076 assert result.to_pylist() == [0, None, None, 0, None, 1]
2077
2078 result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
2079 assert result.to_pylist() == [0, None, None, 0, None, 1]
2080
2081
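# A minimal sketch, not collected by pytest: index_in reports the position
# of each value inside value_set (null when absent), so is_in should agree
# with a validity check on index_in; using pc.is_valid as that check is an
# assumption of this sketch.
def _example_index_in_versus_is_in():
    arr = pa.array([1, 2, None, 3])
    value_set = pa.array([1, 3, None])
    membership = pc.is_in(arr, value_set=value_set)
    positions = pc.index_in(arr, value_set=value_set)
    assert membership == pc.is_valid(positions)

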
2082 def test_quantile():
2083 arr = pa.array([1, 2, 3, 4])
2084
2085 result = pc.quantile(arr)
2086 assert result.to_pylist() == [2.5]
2087
2088 result = pc.quantile(arr, interpolation='lower')
2089 assert result.to_pylist() == [2]
2090 result = pc.quantile(arr, interpolation='higher')
2091 assert result.to_pylist() == [3]
2092 result = pc.quantile(arr, interpolation='nearest')
2093 assert result.to_pylist() == [3]
2094 result = pc.quantile(arr, interpolation='midpoint')
2095 assert result.to_pylist() == [2.5]
2096 result = pc.quantile(arr, interpolation='linear')
2097 assert result.to_pylist() == [2.5]
2098
2099 arr = pa.array([1, 2])
2100
2101 result = pc.quantile(arr, q=[0.25, 0.5, 0.75])
2102 assert result.to_pylist() == [1.25, 1.5, 1.75]
2103
2104 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower')
2105 assert result.to_pylist() == [1, 1, 1]
2106 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher')
2107 assert result.to_pylist() == [2, 2, 2]
2108 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint')
2109 assert result.to_pylist() == [1.5, 1.5, 1.5]
2110 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest')
2111 assert result.to_pylist() == [1, 1, 2]
2112 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear')
2113 assert result.to_pylist() == [1.25, 1.5, 1.75]
2114
2115 with pytest.raises(ValueError, match="Quantile must be between 0 and 1"):
2116 pc.quantile(arr, q=1.1)
2117 with pytest.raises(ValueError, match="not a valid quantile interpolation"):
2118 pc.quantile(arr, interpolation='zzz')
2119
2120
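# A minimal sketch, not collected by pytest: for arr = [1, 2] and q = 0.25
# the default 'linear' interpolation computes 1 + 0.25 * (2 - 1) = 1.25,
# which matches the first value asserted above.
def _example_quantile_linear_interpolation():
    arr = pa.array([1, 2])
    assert pc.quantile(arr, q=0.25).to_pylist() == [1.25]

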
2121 def test_tdigest():
2122 arr = pa.array([1, 2, 3, 4])
2123 result = pc.tdigest(arr)
2124 assert result.to_pylist() == [2.5]
2125
2126 arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])
2127 result = pc.tdigest(arr)
2128 assert result.to_pylist() == [2.5]
2129
2130 arr = pa.array([1, 2, 3, 4])
2131 result = pc.tdigest(arr, q=[0, 0.5, 1])
2132 assert result.to_pylist() == [1, 2.5, 4]
2133
2134 arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])
2135 result = pc.tdigest(arr, q=[0, 0.5, 1])
2136 assert result.to_pylist() == [1, 2.5, 4]
2137
2138
2139 def test_fill_null_segfault():
2140 # ARROW-12672
2141 arr = pa.array([None], pa.bool_()).fill_null(False)
2142 result = arr.cast(pa.int8())
2143 assert result == pa.array([0], pa.int8())
2144
2145
2146 def test_min_max_element_wise():
2147 arr1 = pa.array([1, 2, 3])
2148 arr2 = pa.array([3, 1, 2])
2149 arr3 = pa.array([2, 3, None])
2150
2151 result = pc.max_element_wise(arr1, arr2)
2152 assert result == pa.array([3, 2, 3])
2153 result = pc.min_element_wise(arr1, arr2)
2154 assert result == pa.array([1, 1, 2])
2155
2156 result = pc.max_element_wise(arr1, arr2, arr3)
2157 assert result == pa.array([3, 3, 3])
2158 result = pc.min_element_wise(arr1, arr2, arr3)
2159 assert result == pa.array([1, 1, 2])
2160
2161 # explicitly specifying the option
2162 result = pc.max_element_wise(arr1, arr3, skip_nulls=True)
2163 assert result == pa.array([2, 3, 3])
2164 result = pc.min_element_wise(arr1, arr3, skip_nulls=True)
2165 assert result == pa.array([1, 2, 3])
2166 result = pc.max_element_wise(
2167 arr1, arr3, options=pc.ElementWiseAggregateOptions())
2168 assert result == pa.array([2, 3, 3])
2169 result = pc.min_element_wise(
2170 arr1, arr3, options=pc.ElementWiseAggregateOptions())
2171 assert result == pa.array([1, 2, 3])
2172
2173 # not skipping nulls
2174 result = pc.max_element_wise(arr1, arr3, skip_nulls=False)
2175 assert result == pa.array([2, 3, None])
2176 result = pc.min_element_wise(arr1, arr3, skip_nulls=False)
2177 assert result == pa.array([1, 2, None])
2178
2179
2180 def test_make_struct():
2181 assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'}
2182
2183 assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == {
2184 'i': 1, 's': 'a'}
2185
2186 assert pc.make_struct([1, 2, 3],
2187 "a b c".split()) == pa.StructArray.from_arrays([
2188 [1, 2, 3],
2189 "a b c".split()], names='0 1'.split())
2190
2191 with pytest.raises(ValueError,
2192 match="Array arguments must all be the same length"):
2193 pc.make_struct([1, 2, 3, 4], "a b c".split())
2194
2195 with pytest.raises(ValueError, match="0 arguments but 2 field names"):
2196 pc.make_struct(field_names=['one', 'two'])
2197
2198
2199 def test_case_when():
2200 assert pc.case_when(pc.make_struct([True, False, None],
2201 [False, True, None]),
2202 [1, 2, 3],
2203 [11, 12, 13]) == pa.array([1, 12, None])
2204
2205
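# A minimal sketch, not collected by pytest: it assumes that passing one
# more value argument than there are condition fields makes the last one an
# "else" branch, used for rows where no condition is true.
def _example_case_when_with_else():
    cond = pc.make_struct([True, False, False])
    result = pc.case_when(cond, [1, 2, 3], [11, 12, 13])
    assert result == pa.array([1, 12, 13])

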
2206 def test_list_element():
2207 element_type = pa.struct([('a', pa.float64()), ('b', pa.int8())])
2208 list_type = pa.list_(element_type)
2209 l1 = [{'a': .4, 'b': 2}, None, {'a': .2, 'b': 4}, None, {'a': 5.6, 'b': 6}]
2210 l2 = [None, {'a': .52, 'b': 3}, {'a': .7, 'b': 4}, None, {'a': .6, 'b': 8}]
2211 lists = pa.array([l1, l2], list_type)
2212
2213 index = 1
2214 result = pa.compute.list_element(lists, index)
2215 expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type)
2216 assert result.equals(expected)
2217
2218 index = 4
2219 result = pa.compute.list_element(lists, index)
2220 expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type)
2221 assert result.equals(expected)
2222
2223
2224 def test_count_distinct():
2225 seed = datetime.now()
2226 samples = [seed.replace(year=y) for y in range(1992, 2092)]
2227 arr = pa.array(samples, pa.timestamp("ns"))
2228 result = pa.compute.count_distinct(arr)
2229 expected = pa.scalar(len(samples), type=pa.int64())
2230 assert result.equals(expected)
2231
2232
2233 def test_count_distinct_options():
2234 arr = pa.array([1, 2, 3, None, None])
2235 assert pc.count_distinct(arr).as_py() == 3
2236 assert pc.count_distinct(arr, mode='only_valid').as_py() == 3
2237 assert pc.count_distinct(arr, mode='only_null').as_py() == 1
2238 assert pc.count_distinct(arr, mode='all').as_py() == 4