1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 from datetime import datetime
19 from functools import lru_cache, partial
20 import inspect
21 import pickle
22 import pytest
23 import random
24 import sys
25 import textwrap
26
27 import numpy as np
28
29 try:
30 import pandas as pd
31 except ImportError:
32 pd = None
33
34 import pyarrow as pa
35 import pyarrow.compute as pc
36
37 all_array_types = [
38 ('bool', [True, False, False, True, True]),
39 ('uint8', np.arange(5)),
40 ('int8', np.arange(5)),
41 ('uint16', np.arange(5)),
42 ('int16', np.arange(5)),
43 ('uint32', np.arange(5)),
44 ('int32', np.arange(5)),
45 ('uint64', np.arange(5, 10)),
46 ('int64', np.arange(5, 10)),
47 ('float', np.arange(0, 0.5, 0.1)),
48 ('double', np.arange(0, 0.5, 0.1)),
49 ('string', ['a', 'b', None, 'ddd', 'ee']),
50 ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
51 (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
52 (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
53 (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
54 (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
55 {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
56 ]
57
58 exported_functions = [
59 func for (name, func) in sorted(pc.__dict__.items())
60 if hasattr(func, '__arrow_compute_function__')]
61
62 exported_option_classes = [
63 cls for (name, cls) in sorted(pc.__dict__.items())
64 if (isinstance(cls, type) and
65 cls is not pc.FunctionOptions and
66 issubclass(cls, pc.FunctionOptions))]
67
68 numerical_arrow_types = [
69 pa.int8(),
70 pa.int16(),
71 pa.int64(),
72 pa.uint8(),
73 pa.uint16(),
74 pa.uint64(),
75 pa.float32(),
76 pa.float64()
77 ]
78
79
80 def test_exported_functions():
81 # Check that all exported concrete functions can be called with
82 # the right number of arguments.
83 # Note that unregistered functions (e.g. with a mismatching name)
84 # will raise KeyError.
85 functions = exported_functions
86 assert len(functions) >= 10
87 for func in functions:
88 arity = func.__arrow_compute_function__['arity']
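# An arity of Ellipsis denotes a varargs function; any argument count is
# accepted, so three dummy arguments are used here.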
89 if arity is Ellipsis:
90 args = [object()] * 3
91 else:
92 args = [object()] * arity
93 with pytest.raises(TypeError,
94 match="Got unexpected argument type "
95 "<class 'object'> for compute function"):
96 func(*args)
97
98
99 def test_exported_option_classes():
100 classes = exported_option_classes
101 assert len(classes) >= 10
102 for cls in classes:
103 # Option classes must have an introspectable constructor signature,
104 # and that signature should not have any *args or **kwargs.
105 sig = inspect.signature(cls)
106 for param in sig.parameters.values():
107 assert param.kind not in (param.VAR_POSITIONAL,
108 param.VAR_KEYWORD)
109
110
111 def test_option_class_equality():
112 options = [
113 pc.ArraySortOptions(),
114 pc.AssumeTimezoneOptions("UTC"),
115 pc.CastOptions.safe(pa.int8()),
116 pc.CountOptions(),
117 pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
118 pc.DictionaryEncodeOptions(),
119 pc.ElementWiseAggregateOptions(skip_nulls=True),
120 pc.ExtractRegexOptions("pattern"),
121 pc.FilterOptions(),
122 pc.IndexOptions(pa.scalar(1)),
123 pc.JoinOptions(),
124 pc.MakeStructOptions(["field", "names"],
125 field_nullability=[True, True],
126 field_metadata=[pa.KeyValueMetadata({"a": "1"}),
127 pa.KeyValueMetadata({"b": "2"})]),
128 pc.MatchSubstringOptions("pattern"),
129 pc.ModeOptions(),
130 pc.NullOptions(),
131 pc.PadOptions(5),
132 pc.PartitionNthOptions(1, null_placement="at_start"),
133 pc.QuantileOptions(),
134 pc.ReplaceSliceOptions(0, 1, "a"),
135 pc.ReplaceSubstringOptions("a", "b"),
136 pc.RoundOptions(2, "towards_infinity"),
137 pc.RoundToMultipleOptions(100, "towards_infinity"),
138 pc.ScalarAggregateOptions(),
139 pc.SelectKOptions(0, sort_keys=[("b", "ascending")]),
140 pc.SetLookupOptions(pa.array([1])),
141 pc.SliceOptions(0, 1, 1),
142 pc.SortOptions([("dummy", "descending")], null_placement="at_start"),
143 pc.SplitOptions(),
144 pc.SplitPatternOptions("pattern"),
145 pc.StrftimeOptions(),
146 pc.StrptimeOptions("%Y", "s"),
147 pc.TakeOptions(),
148 pc.TDigestOptions(),
149 pc.TrimOptions(" "),
150 pc.VarianceOptions(),
151 pc.WeekOptions(week_starts_monday=True, count_from_zero=False,
152 first_week_is_fully_in_year=False),
153 ]
154         # TODO: We should test on Windows once ARROW-13168 is resolved.
155 # Timezone database is not available on Windows yet
156 if sys.platform != 'win32':
157 options.append(pc.AssumeTimezoneOptions("Europe/Ljubljana"))
158
159 classes = {type(option) for option in options}
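# Any exported option class not explicitly listed above must be
# default-constructible so that it is still exercised by the checks below.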
160 for cls in exported_option_classes:
161 # Timezone database is not available on Windows yet
162 if cls not in classes and sys.platform != 'win32' and \
163 cls != pc.AssumeTimezoneOptions:
164 try:
165 options.append(cls())
166 except TypeError:
167 pytest.fail(f"Options class is not tested: {cls}")
168 for option in options:
169 assert option == option
170 assert repr(option).startswith(option.__class__.__name__)
171 buf = option.serialize()
172 deserialized = pc.FunctionOptions.deserialize(buf)
173 assert option == deserialized
174 assert repr(option) == repr(deserialized)
175 for option1, option2 in zip(options, options[1:]):
176 assert option1 != option2
177
178 assert repr(pc.IndexOptions(pa.scalar(1))) == "IndexOptions(value=int64:1)"
179 assert repr(pc.ArraySortOptions()) == \
180 "ArraySortOptions(order=Ascending, null_placement=AtEnd)"
181
182
183 def test_list_functions():
184 assert len(pc.list_functions()) > 10
185 assert "add" in pc.list_functions()
186
187
188 def _check_get_function(name, expected_func_cls, expected_ker_cls,
189 min_num_kernels=1):
190 func = pc.get_function(name)
191 assert isinstance(func, expected_func_cls)
192 n = func.num_kernels
193 assert n >= min_num_kernels
194 assert n == len(func.kernels)
195 assert all(isinstance(ker, expected_ker_cls) for ker in func.kernels)
196
197
198 def test_get_function_scalar():
199 _check_get_function("add", pc.ScalarFunction, pc.ScalarKernel, 8)
200
201
202 def test_get_function_vector():
203 _check_get_function("unique", pc.VectorFunction, pc.VectorKernel, 8)
204
205
206 def test_get_function_scalar_aggregate():
207 _check_get_function("mean", pc.ScalarAggregateFunction,
208 pc.ScalarAggregateKernel, 8)
209
210
211 def test_get_function_hash_aggregate():
212 _check_get_function("hash_sum", pc.HashAggregateFunction,
213 pc.HashAggregateKernel, 1)
214
215
216 def test_call_function_with_memory_pool():
217 arr = pa.array(["foo", "bar", "baz"])
218 indices = np.array([2, 2, 1])
219 result1 = arr.take(indices)
220 result2 = pc.call_function('take', [arr, indices],
221 memory_pool=pa.default_memory_pool())
222 expected = pa.array(["baz", "baz", "bar"])
223 assert result1.equals(expected)
224 assert result2.equals(expected)
225
226 result3 = pc.take(arr, indices, memory_pool=pa.default_memory_pool())
227 assert result3.equals(expected)
228
229
230 def test_pickle_functions():
231 # Pickle registered functions
232 for name in pc.list_functions():
233 func = pc.get_function(name)
234 reconstructed = pickle.loads(pickle.dumps(func))
235 assert type(reconstructed) is type(func)
236 assert reconstructed.name == func.name
237 assert reconstructed.arity == func.arity
238 assert reconstructed.num_kernels == func.num_kernels
239
240
241 def test_pickle_global_functions():
242 # Pickle global wrappers (manual or automatic) of registered functions
243 for name in pc.list_functions():
244 func = getattr(pc, name)
245 reconstructed = pickle.loads(pickle.dumps(func))
246 assert reconstructed is func
247
248
249 def test_function_attributes():
250 # Sanity check attributes of registered functions
251 for name in pc.list_functions():
252 func = pc.get_function(name)
253 assert isinstance(func, pc.Function)
254 assert func.name == name
255 kernels = func.kernels
256 assert func.num_kernels == len(kernels)
257 assert all(isinstance(ker, pc.Kernel) for ker in kernels)
258 if func.arity is not Ellipsis:
259 assert func.arity >= 1
260 repr(func)
261 for ker in kernels:
262 repr(ker)
263
264
265 def test_input_type_conversion():
266 # Automatic array conversion from Python
267 arr = pc.add([1, 2], [4, None])
268 assert arr.to_pylist() == [5, None]
269 # Automatic scalar conversion from Python
270 arr = pc.add([1, 2], 4)
271 assert arr.to_pylist() == [5, 6]
272 # Other scalar type
273 assert pc.equal(["foo", "bar", None],
274 "foo").to_pylist() == [True, False, None]
275
276
277 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
278 def test_sum_array(arrow_type):
279 arr = pa.array([1, 2, 3, 4], type=arrow_type)
280 assert arr.sum().as_py() == 10
281 assert pc.sum(arr).as_py() == 10
282
283 arr = pa.array([1, 2, 3, 4, None], type=arrow_type)
284 assert arr.sum().as_py() == 10
285 assert pc.sum(arr).as_py() == 10
286
287 arr = pa.array([None], type=arrow_type)
288 assert arr.sum().as_py() is None # noqa: E711
289 assert pc.sum(arr).as_py() is None # noqa: E711
290 assert arr.sum(min_count=0).as_py() == 0
291 assert pc.sum(arr, min_count=0).as_py() == 0
292
293 arr = pa.array([], type=arrow_type)
294 assert arr.sum().as_py() is None # noqa: E711
295 assert arr.sum(min_count=0).as_py() == 0
296 assert pc.sum(arr, min_count=0).as_py() == 0
297
298
299 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
300 def test_sum_chunked_array(arrow_type):
301 arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)])
302 assert pc.sum(arr).as_py() == 10
303
304 arr = pa.chunked_array([
305 pa.array([1, 2], type=arrow_type), pa.array([3, 4], type=arrow_type)
306 ])
307 assert pc.sum(arr).as_py() == 10
308
309 arr = pa.chunked_array([
310 pa.array([1, 2], type=arrow_type),
311 pa.array([], type=arrow_type),
312 pa.array([3, 4], type=arrow_type)
313 ])
314 assert pc.sum(arr).as_py() == 10
315
316 arr = pa.chunked_array((), type=arrow_type)
317 assert arr.num_chunks == 0
318 assert pc.sum(arr).as_py() is None # noqa: E711
319 assert pc.sum(arr, min_count=0).as_py() == 0
320
321
322 def test_mode_array():
323 # ARROW-9917
324 arr = pa.array([1, 1, 3, 4, 3, 5], type='int64')
325 mode = pc.mode(arr)
326 assert len(mode) == 1
327 assert mode[0].as_py() == {"mode": 1, "count": 2}
328
329 mode = pc.mode(arr, n=2)
330 assert len(mode) == 2
331 assert mode[0].as_py() == {"mode": 1, "count": 2}
332 assert mode[1].as_py() == {"mode": 3, "count": 2}
333
334 arr = pa.array([], type='int64')
335 assert len(pc.mode(arr)) == 0
336
337 arr = pa.array([1, 1, 3, 4, 3, None], type='int64')
338 mode = pc.mode(arr, skip_nulls=False)
339 assert len(mode) == 0
340 mode = pc.mode(arr, min_count=6)
341 assert len(mode) == 0
342 mode = pc.mode(arr, skip_nulls=False, min_count=5)
343 assert len(mode) == 0
344
345
346 def test_mode_chunked_array():
347 # ARROW-9917
348 arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')])
349 mode = pc.mode(arr)
350 assert len(mode) == 1
351 assert mode[0].as_py() == {"mode": 1, "count": 2}
352
353 mode = pc.mode(arr, n=2)
354 assert len(mode) == 2
355 assert mode[0].as_py() == {"mode": 1, "count": 2}
356 assert mode[1].as_py() == {"mode": 3, "count": 2}
357
358 arr = pa.chunked_array((), type='int64')
359 assert arr.num_chunks == 0
360 assert len(pc.mode(arr)) == 0
361
362
363 def test_variance():
364 data = [1, 2, 3, 4, 5, 6, 7, 8]
365 assert pc.variance(data).as_py() == 5.25
366 assert pc.variance(data, ddof=0).as_py() == 5.25
367 assert pc.variance(data, ddof=1).as_py() == 6.0
368
369
370 def test_count_substring():
371 for (ty, offset) in [(pa.string(), pa.int32()),
372 (pa.large_string(), pa.int64())]:
373 arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty)
374
375 result = pc.count_substring(arr, "ab")
376 expected = pa.array([1, 1, 2, 0, 0, None], type=offset)
377 assert expected.equals(result)
378
379 result = pc.count_substring(arr, "ab", ignore_case=True)
380 expected = pa.array([1, 1, 2, 0, 1, None], type=offset)
381 assert expected.equals(result)
382
383
384 def test_count_substring_regex():
385 for (ty, offset) in [(pa.string(), pa.int32()),
386 (pa.large_string(), pa.int64())]:
387 arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty)
388
389 result = pc.count_substring_regex(arr, "a+")
390 expected = pa.array([1, 1, 3, 1, 0, None], type=offset)
391 assert expected.equals(result)
392
393 result = pc.count_substring_regex(arr, "a+", ignore_case=True)
394 expected = pa.array([1, 1, 2, 1, 1, None], type=offset)
395 assert expected.equals(result)
396
397
398 def test_find_substring():
399 for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]:
400 arr = pa.array(["ab", "cab", "ba", None], type=ty)
401 result = pc.find_substring(arr, "ab")
402 assert result.to_pylist() == [0, 1, -1, None]
403
404 result = pc.find_substring_regex(arr, "a?b")
405 assert result.to_pylist() == [0, 1, 0, None]
406
407 arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty)
408 result = pc.find_substring(arr, "aB*", ignore_case=True)
409 assert result.to_pylist() == [0, 1, -1, -1]
410
411 result = pc.find_substring_regex(arr, "a?b", ignore_case=True)
412 assert result.to_pylist() == [0, 1, 0, 0]
413
414
415 def test_match_like():
416 arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
417 result = pc.match_like(arr, r"_a\%%")
418 expected = pa.array([False, True, False, True, None])
419 assert expected.equals(result)
420
421 arr = pa.array(["aB", "bA%", "ba", "ca%d", None])
422 result = pc.match_like(arr, r"_a\%%", ignore_case=True)
423 expected = pa.array([False, True, False, True, None])
424 assert expected.equals(result)
425 result = pc.match_like(arr, r"_a\%%", ignore_case=False)
426 expected = pa.array([False, False, False, True, None])
427 assert expected.equals(result)
428
429
430 def test_match_substring():
431 arr = pa.array(["ab", "abc", "ba", None])
432 result = pc.match_substring(arr, "ab")
433 expected = pa.array([True, True, False, None])
434 assert expected.equals(result)
435
436 arr = pa.array(["áB", "Ábc", "ba", None])
437 result = pc.match_substring(arr, "áb", ignore_case=True)
438 expected = pa.array([True, True, False, None])
439 assert expected.equals(result)
440 result = pc.match_substring(arr, "áb", ignore_case=False)
441 expected = pa.array([False, False, False, None])
442 assert expected.equals(result)
443
444
445 def test_match_substring_regex():
446 arr = pa.array(["ab", "abc", "ba", "c", None])
447 result = pc.match_substring_regex(arr, "^a?b")
448 expected = pa.array([True, True, True, False, None])
449 assert expected.equals(result)
450
451 arr = pa.array(["aB", "Abc", "BA", "c", None])
452 result = pc.match_substring_regex(arr, "^a?b", ignore_case=True)
453 expected = pa.array([True, True, True, False, None])
454 assert expected.equals(result)
455 result = pc.match_substring_regex(arr, "^a?b", ignore_case=False)
456 expected = pa.array([False, False, False, False, None])
457 assert expected.equals(result)
458
459
460 def test_trim():
461 # \u3000 is unicode whitespace
462 arr = pa.array([" foo", None, " \u3000foo bar \t"])
463 result = pc.utf8_trim_whitespace(arr)
464 expected = pa.array(["foo", None, "foo bar"])
465 assert expected.equals(result)
466
467 arr = pa.array([" foo", None, " \u3000foo bar \t"])
468 result = pc.ascii_trim_whitespace(arr)
469 expected = pa.array(["foo", None, "\u3000foo bar"])
470 assert expected.equals(result)
471
472 arr = pa.array([" foo", None, " \u3000foo bar \t"])
473 result = pc.utf8_trim(arr, characters=' f\u3000')
474 expected = pa.array(["oo", None, "oo bar \t"])
475 assert expected.equals(result)
476
477
478 def test_slice_compatibility():
479 arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
480 for start in range(-6, 6):
481 for stop in range(-6, 6):
482 for step in [-3, -2, -1, 1, 2, 3]:
483 expected = pa.array([k.as_py()[start:stop:step]
484 for k in arr])
485 result = pc.utf8_slice_codeunits(
486 arr, start=start, stop=stop, step=step)
487 assert expected.equals(result)
488
489
490 def test_split_pattern():
491 arr = pa.array(["-foo---bar--", "---foo---b"])
492 result = pc.split_pattern(arr, pattern="---")
493 expected = pa.array([["-foo", "bar--"], ["", "foo", "b"]])
494 assert expected.equals(result)
495
496 result = pc.split_pattern(arr, pattern="---", max_splits=1)
497 expected = pa.array([["-foo", "bar--"], ["", "foo---b"]])
498 assert expected.equals(result)
499
500 result = pc.split_pattern(arr, pattern="---", max_splits=1, reverse=True)
501 expected = pa.array([["-foo", "bar--"], ["---foo", "b"]])
502 assert expected.equals(result)
503
504
505 def test_split_whitespace_utf8():
506 arr = pa.array(["foo bar", " foo \u3000\tb"])
507 result = pc.utf8_split_whitespace(arr)
508 expected = pa.array([["foo", "bar"], ["", "foo", "b"]])
509 assert expected.equals(result)
510
511 result = pc.utf8_split_whitespace(arr, max_splits=1)
512 expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
513 assert expected.equals(result)
514
515 result = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True)
516 expected = pa.array([["foo", "bar"], [" foo", "b"]])
517 assert expected.equals(result)
518
519
520 def test_split_whitespace_ascii():
521 arr = pa.array(["foo bar", " foo \u3000\tb"])
522 result = pc.ascii_split_whitespace(arr)
523 expected = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]])
524 assert expected.equals(result)
525
526 result = pc.ascii_split_whitespace(arr, max_splits=1)
527 expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
528 assert expected.equals(result)
529
530 result = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True)
531 expected = pa.array([["foo", "bar"], [" foo \u3000", "b"]])
532 assert expected.equals(result)
533
534
535 def test_split_pattern_regex():
536 arr = pa.array(["-foo---bar--", "---foo---b"])
537 result = pc.split_pattern_regex(arr, pattern="-+")
538 expected = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]])
539 assert expected.equals(result)
540
541 result = pc.split_pattern_regex(arr, pattern="-+", max_splits=1)
542 expected = pa.array([["", "foo---bar--"], ["", "foo---b"]])
543 assert expected.equals(result)
544
545 with pytest.raises(NotImplementedError,
546 match="Cannot split in reverse with regex"):
547 result = pc.split_pattern_regex(
548 arr, pattern="---", max_splits=1, reverse=True)
549
550
551 def test_min_max():
552 # An example generated function wrapper with possible options
553 data = [4, 5, 6, None, 1]
554 s = pc.min_max(data)
555 assert s.as_py() == {'min': 1, 'max': 6}
556 s = pc.min_max(data, options=pc.ScalarAggregateOptions())
557 assert s.as_py() == {'min': 1, 'max': 6}
558 s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True))
559 assert s.as_py() == {'min': 1, 'max': 6}
560 s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False))
561 assert s.as_py() == {'min': None, 'max': None}
562
563 # Options as dict of kwargs
564 s = pc.min_max(data, options={'skip_nulls': False})
565 assert s.as_py() == {'min': None, 'max': None}
566 # Options as named functions arguments
567 s = pc.min_max(data, skip_nulls=False)
568 assert s.as_py() == {'min': None, 'max': None}
569
570 # Both options and named arguments
571 with pytest.raises(TypeError):
572 s = pc.min_max(
573 data, options=pc.ScalarAggregateOptions(), skip_nulls=False)
574
575 # Wrong options type
576 options = pc.TakeOptions()
577 with pytest.raises(TypeError):
578 s = pc.min_max(data, options=options)
579
580 # Missing argument
581 with pytest.raises(ValueError,
582 match="Function min_max accepts 1 argument"):
583 s = pc.min_max()
584
585
586 def test_any():
587 # ARROW-1846
588
589 options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)
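# With skip_nulls=False nulls propagate Kleene-style, and min_count=0 makes an
# empty input return the identity value (False for "any") instead of null.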
590
591 a = pa.array([], type='bool')
592 assert pc.any(a).as_py() is None
593 assert pc.any(a, min_count=0).as_py() is False
594 assert pc.any(a, options=options).as_py() is False
595
596 a = pa.array([False, None, True])
597 assert pc.any(a).as_py() is True
598 assert pc.any(a, options=options).as_py() is True
599
600 a = pa.array([False, None, False])
601 assert pc.any(a).as_py() is False
602 assert pc.any(a, options=options).as_py() is None
603
604
605 def test_all():
606 # ARROW-10301
607
608 options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)
609
610 a = pa.array([], type='bool')
611 assert pc.all(a).as_py() is None
612 assert pc.all(a, min_count=0).as_py() is True
613 assert pc.all(a, options=options).as_py() is True
614
615 a = pa.array([False, True])
616 assert pc.all(a).as_py() is False
617 assert pc.all(a, options=options).as_py() is False
618
619 a = pa.array([True, None])
620 assert pc.all(a).as_py() is True
621 assert pc.all(a, options=options).as_py() is None
622
623 a = pa.chunked_array([[True], [True, None]])
624 assert pc.all(a).as_py() is True
625 assert pc.all(a, options=options).as_py() is None
626
627 a = pa.chunked_array([[True], [False]])
628 assert pc.all(a).as_py() is False
629 assert pc.all(a, options=options).as_py() is False
630
631
632 def test_is_valid():
633 # An example generated function wrapper without options
634 data = [4, 5, None]
635 assert pc.is_valid(data).to_pylist() == [True, True, False]
636
637 with pytest.raises(TypeError):
638 pc.is_valid(data, options=None)
639
640
641 def test_generated_docstrings():
642 assert pc.min_max.__doc__ == textwrap.dedent("""\
643 Compute the minimum and maximum values of a numeric array.
644
645 Null values are ignored by default.
646 This can be changed through ScalarAggregateOptions.
647
648 Parameters
649 ----------
650 array : Array-like
651 Argument to compute function
652 memory_pool : pyarrow.MemoryPool, optional
653 If not passed, will allocate memory from the default memory pool.
654 options : pyarrow.compute.ScalarAggregateOptions, optional
655 Parameters altering compute function semantics.
656 skip_nulls : optional
657 Parameter for ScalarAggregateOptions constructor. Either `options`
658 or `skip_nulls` can be passed, but not both at the same time.
659 min_count : optional
660 Parameter for ScalarAggregateOptions constructor. Either `options`
661 or `min_count` can be passed, but not both at the same time.
662 """)
663 assert pc.add.__doc__ == textwrap.dedent("""\
664 Add the arguments element-wise.
665
666 Results will wrap around on integer overflow.
667 Use function "add_checked" if you want overflow
668 to return an error.
669
670 Parameters
671 ----------
672 x : Array-like or scalar-like
673 Argument to compute function
674 y : Array-like or scalar-like
675 Argument to compute function
676 memory_pool : pyarrow.MemoryPool, optional
677 If not passed, will allocate memory from the default memory pool.
678 """)
679
680
681 def test_generated_signatures():
682 # The self-documentation provided by signatures should show acceptable
683 # options and their default values.
684 sig = inspect.signature(pc.add)
685 assert str(sig) == "(x, y, *, memory_pool=None)"
686 sig = inspect.signature(pc.min_max)
687 assert str(sig) == ("(array, *, memory_pool=None, "
688 "options=None, skip_nulls=True, min_count=1)")
689 sig = inspect.signature(pc.quantile)
690 assert str(sig) == ("(array, *, memory_pool=None, "
691 "options=None, q=0.5, interpolation='linear', "
692 "skip_nulls=True, min_count=0)")
693 sig = inspect.signature(pc.binary_join_element_wise)
694 assert str(sig) == ("(*strings, memory_pool=None, options=None, "
695 "null_handling='emit_null', null_replacement='')")
696
697
698 # We use isprintable to find codepoints that Python doesn't know about, but
699 # utf8proc does (or, in a future version of Python, the other way around).
700 # These codepoints cannot be compared between Arrow and the Python
701 # implementation.
702 @lru_cache()
703 def find_new_unicode_codepoints():
704 new = set()
705 characters = [chr(c) for c in range(0x80, 0x11000)
706 if not (0xD800 <= c < 0xE000)]
707 is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist()
708 for i, c in enumerate(characters):
709 if is_printable[i] != c.isprintable():
710 new.add(ord(c))
711 return new
712
713
714 # Python claims these are not alpha (not sure why); they are in
715 # gc='Other Letter': https://graphemica.com/%E1%B3%B2
716 unknown_issue_is_alpha = {0x1cf2, 0x1cf3}
717 # utf8proc does not know if codepoints are lower case
718 utf8proc_issue_is_lower = {
719 0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
720 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
721 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
722 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33,
723 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39,
724 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f,
725 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45,
726 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b,
727 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51,
728 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57,
729 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d,
730 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63,
731 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69,
732 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e,
733 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4,
734 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa,
735 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0,
736 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6,
737 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc,
738 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090,
739 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096,
740 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c,
741 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8,
742 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, }
743 # utf8proc does not store if a codepoint is numeric
744 numeric_info_missing = {
745 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
746 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
747 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
748 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
749 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
750 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
751 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
752 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
753 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
754 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
755 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
756 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
757 0x10fcb, }
758 # utf8proc has no digit/numeric information
759 digit_info_missing = {
760 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
761 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070,
762 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080,
763 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
764 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464,
765 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476,
766 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488,
767 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f,
768 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9,
769 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777,
770 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e,
771 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
772 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
773 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
774 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
775 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
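# Note: the following assignment replaces the numeric_info_missing set defined
# above, dropping the 0x10fc5-0x10fcb entries.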
776 numeric_info_missing = {
777 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
778 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
779 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
780 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
781 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
782 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
783 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
784 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
785 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
786 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
787 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, }
788
789 codepoints_ignore = {
790 'is_alnum': numeric_info_missing | digit_info_missing |
791 unknown_issue_is_alpha,
792 'is_alpha': unknown_issue_is_alpha,
793 'is_digit': digit_info_missing,
794 'is_numeric': numeric_info_missing,
795 'is_lower': utf8proc_issue_is_lower
796 }
797
798
799 @pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
800 'is_ascii', 'is_decimal',
801 'is_digit', 'is_lower',
802 'is_numeric', 'is_printable',
803 'is_space', 'is_upper', ])
804 @pytest.mark.parametrize('variant', ['ascii', 'utf8'])
805 def test_string_py_compat_boolean(function_name, variant):
806 arrow_name = variant + "_" + function_name
807 py_name = function_name.replace('_', '')
808 ignore = codepoints_ignore.get(function_name, set()) | \
809 find_new_unicode_codepoints()
810     for i in range(128 if variant == 'ascii' else 0x11000):
811 if i in range(0xD800, 0xE000):
812 continue # bug? pyarrow doesn't allow utf16 surrogates
813 # the issues we know of, we skip
814 if i in ignore:
815 continue
816 # Compare results with the equivalent Python predicate
817 # (except "is_space" where functions are known to be incompatible)
818 c = chr(i)
819 if hasattr(pc, arrow_name) and function_name != 'is_space':
820 ar = pa.array([c])
821 arrow_func = getattr(pc, arrow_name)
822 assert arrow_func(ar)[0].as_py() == getattr(c, py_name)()
823
824
825 def test_pad():
826 arr = pa.array([None, 'a', 'abcd'])
827 assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd']
828 assert pc.ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd']
829 assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd']
830
831 arr = pa.array([None, 'á', 'abcd'])
832 assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd']
833 assert pc.utf8_lpad(arr, width=3).tolist() == [None, ' á', 'abcd']
834 assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á ', 'abcd']
835
836
837 @pytest.mark.pandas
838 def test_replace_slice():
839 offsets = range(-3, 4)
840
841 arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
842 series = arr.to_pandas()
843 for start in offsets:
844 for stop in offsets:
845 expected = series.str.slice_replace(start, stop, 'XX')
846 actual = pc.binary_replace_slice(
847 arr, start=start, stop=stop, replacement='XX')
848 assert actual.tolist() == expected.tolist()
849
850 arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
851 series = arr.to_pandas()
852 for start in offsets:
853 for stop in offsets:
854 expected = series.str.slice_replace(start, stop, 'XX')
855 actual = pc.utf8_replace_slice(
856 arr, start=start, stop=stop, replacement='XX')
857 assert actual.tolist() == expected.tolist()
858
859
860 def test_replace_plain():
861 ar = pa.array(['foo', 'food', None])
862 ar = pc.replace_substring(ar, pattern='foo', replacement='bar')
863 assert ar.tolist() == ['bar', 'bard', None]
864
865
866 def test_replace_regex():
867 ar = pa.array(['foo', 'mood', None])
868 ar = pc.replace_substring_regex(ar, pattern='(.)oo', replacement=r'\100')
869 assert ar.tolist() == ['f00', 'm00d', None]
870
871
872 def test_extract_regex():
873 ar = pa.array(['a1', 'zb2z'])
874 struct = pc.extract_regex(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d)')
875 assert struct.tolist() == [{'letter': 'a', 'digit': '1'}, {
876 'letter': 'b', 'digit': '2'}]
877
878
879 def test_binary_join():
880 ar_list = pa.array([['foo', 'bar'], None, []])
881 expected = pa.array(['foo-bar', None, ''])
882 assert pc.binary_join(ar_list, '-').equals(expected)
883
884 separator_array = pa.array(['1', '2'], type=pa.binary())
885 expected = pa.array(['a1b', 'c2d'], type=pa.binary())
886 ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary()))
887 assert pc.binary_join(ar_list, separator_array).equals(expected)
888
889
890 def test_binary_join_element_wise():
891 null = pa.scalar(None, type=pa.string())
892 arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']]
893 assert pc.binary_join_element_wise(*arrs).to_pylist() == \
894 [None, None, 'b--d']
895 assert pc.binary_join_element_wise('a', 'b', '-').as_py() == 'a-b'
896 assert pc.binary_join_element_wise('a', null, '-').as_py() is None
897 assert pc.binary_join_element_wise('a', 'b', null).as_py() is None
898
899 skip = pc.JoinOptions(null_handling='skip')
900 assert pc.binary_join_element_wise(*arrs, options=skip).to_pylist() == \
901 [None, 'a', 'b--d']
902 assert pc.binary_join_element_wise(
903 'a', 'b', '-', options=skip).as_py() == 'a-b'
904 assert pc.binary_join_element_wise(
905 'a', null, '-', options=skip).as_py() == 'a'
906 assert pc.binary_join_element_wise(
907 'a', 'b', null, options=skip).as_py() is None
908
909 replace = pc.JoinOptions(null_handling='replace', null_replacement='spam')
910 assert pc.binary_join_element_wise(*arrs, options=replace).to_pylist() == \
911 [None, 'a-spam', 'b--d']
912 assert pc.binary_join_element_wise(
913 'a', 'b', '-', options=replace).as_py() == 'a-b'
914 assert pc.binary_join_element_wise(
915 'a', null, '-', options=replace).as_py() == 'a-spam'
916 assert pc.binary_join_element_wise(
917 'a', 'b', null, options=replace).as_py() is None
918
919
920 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
921 def test_take(ty, values):
922 arr = pa.array(values, type=ty)
923 for indices_type in [pa.int8(), pa.int64()]:
924 indices = pa.array([0, 4, 2, None], type=indices_type)
925 result = arr.take(indices)
926 result.validate()
927 expected = pa.array([values[0], values[4], values[2], None], type=ty)
928 assert result.equals(expected)
929
930 # empty indices
931 indices = pa.array([], type=indices_type)
932 result = arr.take(indices)
933 result.validate()
934 expected = pa.array([], type=ty)
935 assert result.equals(expected)
936
937 indices = pa.array([2, 5])
938 with pytest.raises(IndexError):
939 arr.take(indices)
940
941 indices = pa.array([2, -1])
942 with pytest.raises(IndexError):
943 arr.take(indices)
944
945
946 def test_take_indices_types():
947 arr = pa.array(range(5))
948
949 for indices_type in ['uint8', 'int8', 'uint16', 'int16',
950 'uint32', 'int32', 'uint64', 'int64']:
951 indices = pa.array([0, 4, 2, None], type=indices_type)
952 result = arr.take(indices)
953 result.validate()
954 expected = pa.array([0, 4, 2, None])
955 assert result.equals(expected)
956
957 for indices_type in [pa.float32(), pa.float64()]:
958 indices = pa.array([0, 4, 2], type=indices_type)
959 with pytest.raises(NotImplementedError):
960 arr.take(indices)
961
962
963 def test_take_on_chunked_array():
964 # ARROW-9504
965 arr = pa.chunked_array([
966 [
967 "a",
968 "b",
969 "c",
970 "d",
971 "e"
972 ],
973 [
974 "f",
975 "g",
976 "h",
977 "i",
978 "j"
979 ]
980 ])
981
982 indices = np.array([0, 5, 1, 6, 9, 2])
983 result = arr.take(indices)
984 expected = pa.chunked_array([["a", "f", "b", "g", "j", "c"]])
985 assert result.equals(expected)
986
987 indices = pa.chunked_array([[1], [9, 2]])
988 result = arr.take(indices)
989 expected = pa.chunked_array([
990 [
991 "b"
992 ],
993 [
994 "j",
995 "c"
996 ]
997 ])
998 assert result.equals(expected)
999
1000
1001 @pytest.mark.parametrize('ordered', [False, True])
1002 def test_take_dictionary(ordered):
1003 arr = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
1004 ordered=ordered)
1005 result = arr.take(pa.array([0, 1, 3]))
1006 result.validate()
1007 assert result.to_pylist() == ['a', 'b', 'a']
1008 assert result.dictionary.to_pylist() == ['a', 'b', 'c']
1009 assert result.type.ordered is ordered
1010
1011
1012 def test_take_null_type():
1013 # ARROW-10027
1014 arr = pa.array([None] * 10)
1015 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1016 batch = pa.record_batch([arr], names=['a'])
1017 table = pa.table({'a': arr})
1018
1019 indices = pa.array([1, 3, 7, None])
1020 assert len(arr.take(indices)) == 4
1021 assert len(chunked_arr.take(indices)) == 4
1022 assert len(batch.take(indices).column(0)) == 4
1023 assert len(table.take(indices).column(0)) == 4
1024
1025
1026 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
1027 def test_drop_null(ty, values):
1028 arr = pa.array(values, type=ty)
1029 result = arr.drop_null()
1030 result.validate(full=True)
1031 indices = [i for i in range(len(arr)) if arr[i].is_valid]
1032 expected = arr.take(pa.array(indices))
1033 assert result.equals(expected)
1034
1035
1036 def test_drop_null_chunked_array():
1037 arr = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
1038 expected_drop = pa.chunked_array([["a"], ["c", "d"], [], []])
1039
1040 result = arr.drop_null()
1041 assert result.equals(expected_drop)
1042
1043
1044 def test_drop_null_record_batch():
1045 batch = pa.record_batch(
1046 [pa.array(["a", None, "c", "d", None])], names=["a'"])
1047 result = batch.drop_null()
1048 expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
1049 assert result.equals(expected)
1050
1051 batch = pa.record_batch(
1052 [pa.array(["a", None, "c", "d", None]),
1053 pa.array([None, None, "c", None, "e"])], names=["a'", "b'"])
1054
1055 result = batch.drop_null()
1056 expected = pa.record_batch(
1057 [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"])
1058 assert result.equals(expected)
1059
1060
1061 def test_drop_null_table():
1062 table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])
1063 expected = pa.table([pa.array(["a", "c", "d"])], names=["a"])
1064 result = table.drop_null()
1065 assert result.equals(expected)
1066
1067 table = pa.table([pa.chunked_array([["a", None], ["c", "d", None]]),
1068 pa.chunked_array([["a", None], [None, "d", None]]),
1069 pa.chunked_array([["a"], ["b"], [None], ["d", None]])],
1070 names=["a", "b", "c"])
1071 expected = pa.table([pa.array(["a", "d"]),
1072 pa.array(["a", "d"]),
1073 pa.array(["a", "d"])],
1074 names=["a", "b", "c"])
1075 result = table.drop_null()
1076 assert result.equals(expected)
1077
1078 table = pa.table([pa.chunked_array([["a", "b"], ["c", "d", "e"]]),
1079 pa.chunked_array([["A"], ["B"], [None], ["D", None]]),
1080 pa.chunked_array([["a`", None], ["c`", "d`", None]])],
1081 names=["a", "b", "c"])
1082 expected = pa.table([pa.array(["a", "d"]),
1083 pa.array(["A", "D"]),
1084 pa.array(["a`", "d`"])],
1085 names=["a", "b", "c"])
1086 result = table.drop_null()
1087 assert result.equals(expected)
1088
1089
1090 def test_drop_null_null_type():
1091 arr = pa.array([None] * 10)
1092 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1093 batch = pa.record_batch([arr], names=['a'])
1094 table = pa.table({'a': arr})
1095
1096 assert len(arr.drop_null()) == 0
1097 assert len(chunked_arr.drop_null()) == 0
1098 assert len(batch.drop_null().column(0)) == 0
1099 assert len(table.drop_null().column(0)) == 0
1100
1101
1102 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
1103 def test_filter(ty, values):
1104 arr = pa.array(values, type=ty)
1105
1106 mask = pa.array([True, False, False, True, None])
1107 result = arr.filter(mask, null_selection_behavior='drop')
1108 result.validate()
1109 assert result.equals(pa.array([values[0], values[3]], type=ty))
1110 result = arr.filter(mask, null_selection_behavior='emit_null')
1111 result.validate()
1112 assert result.equals(pa.array([values[0], values[3], None], type=ty))
1113
1114 # non-boolean dtype
1115 mask = pa.array([0, 1, 0, 1, 0])
1116 with pytest.raises(NotImplementedError):
1117 arr.filter(mask)
1118
1119 # wrong length
1120 mask = pa.array([True, False, True])
1121 with pytest.raises(ValueError, match="must all be the same length"):
1122 arr.filter(mask)
1123
1124
1125 def test_filter_chunked_array():
1126 arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
1127 expected_drop = pa.chunked_array([["a"], ["e"]])
1128 expected_null = pa.chunked_array([["a"], [None, "e"]])
1129
1130 for mask in [
1131 # mask is array
1132 pa.array([True, False, None, False, True]),
1133 # mask is chunked array
1134 pa.chunked_array([[True, False, None], [False, True]]),
1135 # mask is python object
1136 [True, False, None, False, True]
1137 ]:
1138 result = arr.filter(mask)
1139 assert result.equals(expected_drop)
1140 result = arr.filter(mask, null_selection_behavior="emit_null")
1141 assert result.equals(expected_null)
1142
1143
1144 def test_filter_record_batch():
1145 batch = pa.record_batch(
1146 [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
1147
1148 # mask is array
1149 mask = pa.array([True, False, None, False, True])
1150 result = batch.filter(mask)
1151 expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"])
1152 assert result.equals(expected)
1153
1154 result = batch.filter(mask, null_selection_behavior="emit_null")
1155 expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"])
1156 assert result.equals(expected)
1157
1158
1159 def test_filter_table():
1160 table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
1161 expected_drop = pa.table([pa.array(["a", "e"])], names=["a"])
1162 expected_null = pa.table([pa.array(["a", None, "e"])], names=["a"])
1163
1164 for mask in [
1165 # mask is array
1166 pa.array([True, False, None, False, True]),
1167 # mask is chunked array
1168 pa.chunked_array([[True, False], [None, False, True]]),
1169 # mask is python object
1170 [True, False, None, False, True]
1171 ]:
1172 result = table.filter(mask)
1173 assert result.equals(expected_drop)
1174 result = table.filter(mask, null_selection_behavior="emit_null")
1175 assert result.equals(expected_null)
1176
1177
1178 def test_filter_errors():
1179 arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
1180 batch = pa.record_batch(
1181 [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
1182 table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
1183
1184 for obj in [arr, batch, table]:
1185 # non-boolean dtype
1186 mask = pa.array([0, 1, 0, 1, 0])
1187 with pytest.raises(NotImplementedError):
1188 obj.filter(mask)
1189
1190 # wrong length
1191 mask = pa.array([True, False, True])
1192 with pytest.raises(pa.ArrowInvalid,
1193 match="must all be the same length"):
1194 obj.filter(mask)
1195
1196
1197 def test_filter_null_type():
1198 # ARROW-10027
1199 arr = pa.array([None] * 10)
1200 chunked_arr = pa.chunked_array([[None] * 5] * 2)
1201 batch = pa.record_batch([arr], names=['a'])
1202 table = pa.table({'a': arr})
1203
1204 mask = pa.array([True, False] * 5)
1205 assert len(arr.filter(mask)) == 5
1206 assert len(chunked_arr.filter(mask)) == 5
1207 assert len(batch.filter(mask).column(0)) == 5
1208 assert len(table.filter(mask).column(0)) == 5
1209
1210
1211 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1212 def test_compare_array(typ):
1213 if typ == "array":
1214 def con(values):
1215 return pa.array(values)
1216 else:
1217 def con(values):
1218 return pa.chunked_array([values])
1219
1220 arr1 = con([1, 2, 3, 4, None])
1221 arr2 = con([1, 1, 4, None, 4])
1222
1223 result = pc.equal(arr1, arr2)
1224 assert result.equals(con([True, False, False, None, None]))
1225
1226 result = pc.not_equal(arr1, arr2)
1227 assert result.equals(con([False, True, True, None, None]))
1228
1229 result = pc.less(arr1, arr2)
1230 assert result.equals(con([False, False, True, None, None]))
1231
1232 result = pc.less_equal(arr1, arr2)
1233 assert result.equals(con([True, False, True, None, None]))
1234
1235 result = pc.greater(arr1, arr2)
1236 assert result.equals(con([False, True, False, None, None]))
1237
1238 result = pc.greater_equal(arr1, arr2)
1239 assert result.equals(con([True, True, False, None, None]))
1240
1241
1242 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1243 def test_compare_string_scalar(typ):
1244 if typ == "array":
1245 def con(values):
1246 return pa.array(values)
1247 else:
1248 def con(values):
1249 return pa.chunked_array([values])
1250
1251 arr = con(['a', 'b', 'c', None])
1252 scalar = pa.scalar('b')
1253
1254 result = pc.equal(arr, scalar)
1255 assert result.equals(con([False, True, False, None]))
1256
1257 if typ == "array":
1258 nascalar = pa.scalar(None, type="string")
1259 result = pc.equal(arr, nascalar)
1260 isnull = pc.is_null(result)
1261 assert isnull.equals(con([True, True, True, True]))
1262
1263 result = pc.not_equal(arr, scalar)
1264 assert result.equals(con([True, False, True, None]))
1265
1266 result = pc.less(arr, scalar)
1267 assert result.equals(con([True, False, False, None]))
1268
1269 result = pc.less_equal(arr, scalar)
1270 assert result.equals(con([True, True, False, None]))
1271
1272 result = pc.greater(arr, scalar)
1273 assert result.equals(con([False, False, True, None]))
1274
1275 result = pc.greater_equal(arr, scalar)
1276 assert result.equals(con([False, True, True, None]))
1277
1278
1279 @pytest.mark.parametrize("typ", ["array", "chunked_array"])
1280 def test_compare_scalar(typ):
1281 if typ == "array":
1282 def con(values):
1283 return pa.array(values)
1284 else:
1285 def con(values):
1286 return pa.chunked_array([values])
1287
1288 arr = con([1, 2, 3, None])
1289 scalar = pa.scalar(2)
1290
1291 result = pc.equal(arr, scalar)
1292 assert result.equals(con([False, True, False, None]))
1293
1294 if typ == "array":
1295 nascalar = pa.scalar(None, type="int64")
1296 result = pc.equal(arr, nascalar)
1297 assert result.to_pylist() == [None, None, None, None]
1298
1299 result = pc.not_equal(arr, scalar)
1300 assert result.equals(con([True, False, True, None]))
1301
1302 result = pc.less(arr, scalar)
1303 assert result.equals(con([True, False, False, None]))
1304
1305 result = pc.less_equal(arr, scalar)
1306 assert result.equals(con([True, True, False, None]))
1307
1308 result = pc.greater(arr, scalar)
1309 assert result.equals(con([False, False, True, None]))
1310
1311 result = pc.greater_equal(arr, scalar)
1312 assert result.equals(con([False, True, True, None]))
1313
1314
1315 def test_compare_chunked_array_mixed():
1316 arr = pa.array([1, 2, 3, 4, None])
1317 arr_chunked = pa.chunked_array([[1, 2, 3], [4, None]])
1318 arr_chunked2 = pa.chunked_array([[1, 2], [3, 4, None]])
1319
1320 expected = pa.chunked_array([[True, True, True, True, None]])
1321
1322 for left, right in [
1323 (arr, arr_chunked),
1324 (arr_chunked, arr),
1325 (arr_chunked, arr_chunked2),
1326 ]:
1327 result = pc.equal(left, right)
1328 assert result.equals(expected)
1329
1330
1331 def test_arithmetic_add():
1332 left = pa.array([1, 2, 3, 4, 5])
1333 right = pa.array([0, -1, 1, 2, 3])
1334 result = pc.add(left, right)
1335 expected = pa.array([1, 1, 4, 6, 8])
1336 assert result.equals(expected)
1337
1338
1339 def test_arithmetic_subtract():
1340 left = pa.array([1, 2, 3, 4, 5])
1341 right = pa.array([0, -1, 1, 2, 3])
1342 result = pc.subtract(left, right)
1343 expected = pa.array([1, 3, 2, 2, 2])
1344 assert result.equals(expected)
1345
1346
1347 def test_arithmetic_multiply():
1348 left = pa.array([1, 2, 3, 4, 5])
1349 right = pa.array([0, -1, 1, 2, 3])
1350 result = pc.multiply(left, right)
1351 expected = pa.array([0, -2, 3, 8, 15])
1352 assert result.equals(expected)
1353
1354
1355 @pytest.mark.parametrize("ty", ["round", "round_to_multiple"])
1356 def test_round_to_integer(ty):
1357 if ty == "round":
1358 round = pc.round
1359 RoundOptions = partial(pc.RoundOptions, ndigits=0)
1360 elif ty == "round_to_multiple":
1361 round = pc.round_to_multiple
1362 RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1)
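# Rounding to 0 digits and rounding to a multiple of 1 are equivalent, so both
# kernels share the same expected results per rounding mode.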
1363
1364 values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None]
1365 rmode_and_expected = {
1366 "down": [3, 3, 3, 4, -4, -4, -4, None],
1367 "up": [4, 4, 4, 5, -3, -3, -3, None],
1368 "towards_zero": [3, 3, 3, 4, -3, -3, -3, None],
1369 "towards_infinity": [4, 4, 4, 5, -4, -4, -4, None],
1370 "half_down": [3, 3, 4, 4, -3, -4, -4, None],
1371 "half_up": [3, 4, 4, 5, -3, -3, -4, None],
1372 "half_towards_zero": [3, 3, 4, 4, -3, -3, -4, None],
1373 "half_towards_infinity": [3, 4, 4, 5, -3, -4, -4, None],
1374 "half_to_even": [3, 4, 4, 4, -3, -4, -4, None],
1375 "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None],
1376 }
1377 for round_mode, expected in rmode_and_expected.items():
1378 options = RoundOptions(round_mode=round_mode)
1379 result = round(values, options=options)
1380 np.testing.assert_array_equal(result, pa.array(expected))
1381
1382
1383 def test_round():
1384 values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
1385 ndigits_and_expected = {
1386 -2: [300, 0, 0, 0, -0, -0, -0, None],
1387 -1: [320, 0, 0, 0, -0, -40, -0, None],
1388 0: [320, 4, 3, 5, -3, -35, -3, None],
1389 1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
1390 2: [320, 3.5, 3.08, 4.5, -3.21, -35.12, -3.05, None],
1391 }
1392 for ndigits, expected in ndigits_and_expected.items():
1393 options = pc.RoundOptions(ndigits, "half_towards_infinity")
1394 result = pc.round(values, options=options)
1395 np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
1396
1397
1398 def test_round_to_multiple():
1399 values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
1400 multiple_and_expected = {
1401 2: [320, 4, 4, 4, -4, -36, -4, None],
1402 0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None],
1403 0.1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
1404 10: [320, 0, 0, 0, -0, -40, -0, None],
1405 100: [300, 0, 0, 0, -0, -0, -0, None],
1406 }
1407 for multiple, expected in multiple_and_expected.items():
1408 options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity")
1409 result = pc.round_to_multiple(values, options=options)
1410 np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
1411
1412 with pytest.raises(pa.ArrowInvalid, match="multiple must be positive"):
1413 pc.round_to_multiple(values, multiple=-2)
1414
1415
1416 def test_is_null():
1417 arr = pa.array([1, 2, 3, None])
1418 result = arr.is_null()
1419 expected = pa.array([False, False, False, True])
1420 assert result.equals(expected)
1421 assert result.equals(pc.is_null(arr))
1422 result = arr.is_valid()
1423 expected = pa.array([True, True, True, False])
1424 assert result.equals(expected)
1425 assert result.equals(pc.is_valid(arr))
1426
1427 arr = pa.chunked_array([[1, 2], [3, None]])
1428 result = arr.is_null()
1429 expected = pa.chunked_array([[False, False], [False, True]])
1430 assert result.equals(expected)
1431 result = arr.is_valid()
1432 expected = pa.chunked_array([[True, True], [True, False]])
1433 assert result.equals(expected)
1434
1435 arr = pa.array([1, 2, 3, None, np.nan])
1436 result = arr.is_null()
1437 expected = pa.array([False, False, False, True, False])
1438 assert result.equals(expected)
1439
1440 result = arr.is_null(nan_is_null=True)
1441 expected = pa.array([False, False, False, True, True])
1442 assert result.equals(expected)
1443
1444
1445 def test_fill_null():
1446 arr = pa.array([1, 2, None, 4], type=pa.int8())
1447 fill_value = pa.array([5], type=pa.int8())
1448 with pytest.raises(pa.ArrowInvalid,
1449 match="Array arguments must all be the same length"):
1450 arr.fill_null(fill_value)
1451
1452 arr = pa.array([None, None, None, None], type=pa.null())
1453 fill_value = pa.scalar(None, type=pa.null())
1454 result = arr.fill_null(fill_value)
1455 expected = pa.array([None, None, None, None])
1456 assert result.equals(expected)
1457
1458 arr = pa.array(['a', 'bb', None])
1459 result = arr.fill_null('ccc')
1460 expected = pa.array(['a', 'bb', 'ccc'])
1461 assert result.equals(expected)
1462
1463 arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
1464 result = arr.fill_null('ccc')
1465 expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
1466 assert result.equals(expected)
1467
1468 arr = pa.array(['a', 'bb', None])
1469 result = arr.fill_null(None)
1470 expected = pa.array(['a', 'bb', None])
1471 assert result.equals(expected)
1472
1473
1474 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
1475 def test_fill_null_array(arrow_type):
1476 arr = pa.array([1, 2, None, 4], type=arrow_type)
1477 fill_value = pa.scalar(5, type=arrow_type)
1478 result = arr.fill_null(fill_value)
1479 expected = pa.array([1, 2, 5, 4], type=arrow_type)
1480 assert result.equals(expected)
1481
1482 # Implicit conversions
1483 result = arr.fill_null(5)
1484 assert result.equals(expected)
1485
1486 # ARROW-9451: Unsigned integers allow this for some reason
1487 if not pa.types.is_unsigned_integer(arr.type):
1488 with pytest.raises((ValueError, TypeError)):
1489 arr.fill_null('5')
1490
1491 result = arr.fill_null(pa.scalar(5, type='int8'))
1492 assert result.equals(expected)
1493
1494
1495 @pytest.mark.parametrize('arrow_type', numerical_arrow_types)
1496 def test_fill_null_chunked_array(arrow_type):
1497 fill_value = pa.scalar(5, type=arrow_type)
1498 arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
1499 result = arr.fill_null(fill_value)
1500 expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)])
1501 assert result.equals(expected)
1502
1503 arr = pa.chunked_array([
1504 pa.array([1, 2], type=arrow_type),
1505 pa.array([], type=arrow_type),
1506 pa.array([None, 4], type=arrow_type)
1507 ])
1508 expected = pa.chunked_array([
1509 pa.array([1, 2], type=arrow_type),
1510 pa.array([], type=arrow_type),
1511 pa.array([5, 4], type=arrow_type)
1512 ])
1513 result = arr.fill_null(fill_value)
1514 assert result.equals(expected)
1515
1516 # Implicit conversions
1517 result = arr.fill_null(5)
1518 assert result.equals(expected)
1519
1520 result = arr.fill_null(pa.scalar(5, type='int8'))
1521 assert result.equals(expected)
1522
1523
1524 def test_logical():
1525 a = pa.array([True, False, False, None])
1526 b = pa.array([True, True, False, True])
1527
1528 assert pc.and_(a, b) == pa.array([True, False, False, None])
1529 assert pc.and_kleene(a, b) == pa.array([True, False, False, None])
1530
1531 assert pc.or_(a, b) == pa.array([True, True, False, None])
1532 assert pc.or_kleene(a, b) == pa.array([True, True, False, True])
1533
1534 assert pc.xor(a, b) == pa.array([False, True, False, None])
1535
1536 assert pc.invert(a) == pa.array([False, True, True, None])
1537
1538
1539 def test_cast():
1540 arr = pa.array([2 ** 63 - 1], type='int64')
1541
1542 with pytest.raises(pa.ArrowInvalid):
1543 pc.cast(arr, 'int32')
1544
1545 assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32')
1546
1547 arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
1548 expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
1549 assert pc.cast(arr, 'timestamp[ms]') == expected
1550
1551 arr = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8()))
1552 expected = pa.array([["1", "2"], ["3", "4", "5"]],
1553 type=pa.list_(pa.utf8()))
1554 assert pc.cast(arr, expected.type) == expected
1555
1556
1557 def test_strptime():
1558 arr = pa.array(["5/1/2020", None, "12/13/1900"])
1559
1560 got = pc.strptime(arr, format='%m/%d/%Y', unit='s')
1561 expected = pa.array([datetime(2020, 5, 1), None, datetime(1900, 12, 13)],
1562 type=pa.timestamp('s'))
1563 assert got == expected
1564
1565
1566 # TODO: We should test on Windows once ARROW-13168 is resolved.
1567 @pytest.mark.pandas
1568 @pytest.mark.skipif(sys.platform == 'win32',
1569 reason="Timezone database is not available on Windows yet")
1570 def test_strftime():
1571 from pyarrow.vendored.version import Version
1572
1573 def _fix_timestamp(s):
1574 if Version(pd.__version__) < Version("1.0.0"):
1575 return s.to_series().replace("NaT", pd.NaT)
1576 else:
1577 return s
1578
1579 times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
1580 timezones = ["CET", "UTC", "Europe/Ljubljana"]
1581
1582 formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
1583 "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
1584 "%X", "%%", "%G", "%V", "%u"]
1585
1586 for timezone in timezones:
1587 ts = pd.to_datetime(times).tz_localize(timezone)
1588 for unit in ["s", "ms", "us", "ns"]:
1589 tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
1590 for fmt in formats:
1591 options = pc.StrftimeOptions(fmt)
1592 result = pc.strftime(tsa, options=options)
1593 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1594 assert result.equals(expected)
1595
1596 fmt = "%Y-%m-%dT%H:%M:%S"
1597
1598 # Default format
1599 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1600 result = pc.strftime(tsa, options=pc.StrftimeOptions())
1601 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1602 assert result.equals(expected)
1603
1604 # Default format plus timezone
1605 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1606 result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
1607 expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
1608 assert result.equals(expected)
1609
1610 # Pandas %S is equivalent to %S in arrow for unit="s"
1611 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1612 options = pc.StrftimeOptions("%S")
1613 result = pc.strftime(tsa, options=options)
1614 expected = pa.array(_fix_timestamp(ts.strftime("%S")))
1615 assert result.equals(expected)
1616
1617 # Pandas %S.%f is equivalent to %S in arrow for unit="us"
1618 tsa = pa.array(ts, type=pa.timestamp("us", timezone))
1619 options = pc.StrftimeOptions("%S")
1620 result = pc.strftime(tsa, options=options)
1621 expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
1622 assert result.equals(expected)
1623
1624 # Test setting locale
1625 tsa = pa.array(ts, type=pa.timestamp("s", timezone))
1626 options = pc.StrftimeOptions(fmt, locale="C")
1627 result = pc.strftime(tsa, options=options)
1628 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1629 assert result.equals(expected)
1630
1631 # Test timestamps without timezone
1632 fmt = "%Y-%m-%dT%H:%M:%S"
1633 ts = pd.to_datetime(times)
1634 tsa = pa.array(ts, type=pa.timestamp("s"))
1635 result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
1636 expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1637
1638 assert result.equals(expected)
1639 with pytest.raises(pa.ArrowInvalid,
1640 match="Timezone not present, cannot convert to string"):
1641 pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
1642 with pytest.raises(pa.ArrowInvalid,
1643 match="Timezone not present, cannot convert to string"):
1644 pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z"))
1645
1646
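# A minimal sketch, not collected by pytest: based on the assertions above,
# the default StrftimeOptions format appears to be "%Y-%m-%dT%H:%M:%S", and
# timezone-naive input works as long as the format contains no %z/%Z; both
# points are inferred from the test above rather than stated guarantees.
def _example_strftime_naive_default():
    tsa = pa.array([datetime(2020, 5, 1, 9, 30)], type=pa.timestamp('s'))
    assert pc.strftime(tsa) == pa.array(["2020-05-01T09:30:00"])

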
1647 def _check_datetime_components(timestamps, timezone=None):
1648 from pyarrow.vendored.version import Version
1649
1650 ts = pd.to_datetime(timestamps).tz_localize(
1651 "UTC").tz_convert(timezone).to_series()
1652 tsa = pa.array(ts, pa.timestamp("ns", tz=timezone))
1653
1654 subseconds = ((ts.dt.microsecond * 10 ** 3 +
1655 ts.dt.nanosecond) * 10 ** -9).round(9)
1656 iso_calendar_fields = [
1657 pa.field('iso_year', pa.int64()),
1658 pa.field('iso_week', pa.int64()),
1659 pa.field('iso_day_of_week', pa.int64())
1660 ]
1661
1662 if Version(pd.__version__) < Version("1.1.0"):
1663 # https://github.com/pandas-dev/pandas/issues/33206
1664 iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64")
1665 iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64")
1666 iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64")
1667 else:
1668 # Casting is required because pandas isocalendar returns int32
1669 # while arrow isocalendar returns int64.
1670 iso_year = ts.dt.isocalendar()["year"].astype("int64")
1671 iso_week = ts.dt.isocalendar()["week"].astype("int64")
1672 iso_day = ts.dt.isocalendar()["day"].astype("int64")
1673
1674 iso_calendar = pa.StructArray.from_arrays(
1675 [iso_year, iso_week, iso_day],
1676 fields=iso_calendar_fields)
1677
1678 assert pc.year(tsa).equals(pa.array(ts.dt.year))
1679 assert pc.month(tsa).equals(pa.array(ts.dt.month))
1680 assert pc.day(tsa).equals(pa.array(ts.dt.day))
1681 assert pc.day_of_week(tsa).equals(pa.array(ts.dt.dayofweek))
1682 assert pc.day_of_year(tsa).equals(pa.array(ts.dt.dayofyear))
1683 assert pc.iso_year(tsa).equals(pa.array(iso_year))
1684 assert pc.iso_week(tsa).equals(pa.array(iso_week))
1685 assert pc.iso_calendar(tsa).equals(iso_calendar)
1686 assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter))
1687 assert pc.hour(tsa).equals(pa.array(ts.dt.hour))
1688 assert pc.minute(tsa).equals(pa.array(ts.dt.minute))
1689 assert pc.second(tsa).equals(pa.array(ts.dt.second.values))
1690 assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10 ** 3))
1691 assert pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10 ** 3))
1692 assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond))
1693 assert pc.subsecond(tsa).equals(pa.array(subseconds))
1694
1695 day_of_week_options = pc.DayOfWeekOptions(
1696 count_from_zero=False, week_start=1)
1697 assert pc.day_of_week(tsa, options=day_of_week_options).equals(
1698 pa.array(ts.dt.dayofweek + 1))
1699
1700 week_options = pc.WeekOptions(
1701 week_starts_monday=True, count_from_zero=False,
1702 first_week_is_fully_in_year=False)
1703 assert pc.week(tsa, options=week_options).equals(pa.array(iso_week))
1704
1705
1706 @pytest.mark.pandas
1707 def test_extract_datetime_components():
1708 from pyarrow.vendored.version import Version
1709
1710 timestamps = ["1970-01-01T00:00:59.123456789",
1711 "2000-02-29T23:23:23.999999999",
1712 "2033-05-18T03:33:20.000000000",
1713 "2020-01-01T01:05:05.001",
1714 "2019-12-31T02:10:10.002",
1715 "2019-12-30T03:15:15.003",
1716 "2009-12-31T04:20:20.004132",
1717 "2010-01-01T05:25:25.005321",
1718 "2010-01-03T06:30:30.006163",
1719 "2010-01-04T07:35:35",
1720 "2006-01-01T08:40:40",
1721 "2005-12-31T09:45:45",
1722 "2008-12-28",
1723 "2008-12-29",
1724 "2012-01-01 01:02:03"]
1725 timezones = ["UTC", "US/Central", "Asia/Kolkata",
1726 "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"]
1727
1728 # Test timezone naive timestamp array
1729 _check_datetime_components(timestamps)
1730
1731 # Test timezone aware timestamp array
1732 if sys.platform == 'win32':
1733 # TODO: We should test on windows once ARROW-13168 is resolved.
1734 pytest.skip('Timezone database is not available on Windows yet')
1735 elif Version(pd.__version__) < Version('1.0.0'):
1736 pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
1737 else:
1738 for timezone in timezones:
1739 _check_datetime_components(timestamps, timezone)
1740
1741
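# A minimal sketch, not collected by pytest: the extraction kernels also
# work on plain Python datetimes, without the pandas round-trip used by
# _check_datetime_components above.
def _example_extract_components_without_pandas():
    tsa = pa.array([datetime(2020, 2, 29, 13, 45)], type=pa.timestamp('s'))
    assert pc.year(tsa) == pa.array([2020])
    assert pc.month(tsa) == pa.array([2])
    assert pc.day(tsa) == pa.array([29])
    assert pc.hour(tsa) == pa.array([13])

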
1742 # TODO: We should test on windows once ARROW-13168 is resolved.
1743 @pytest.mark.pandas
1744 @pytest.mark.skipif(sys.platform == 'win32',
1745 reason="Timezone database is not available on Windows yet")
1746 def test_assume_timezone():
1747 from pyarrow.vendored.version import Version
1748
1749 ts_type = pa.timestamp("ns")
1750 timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
1751 "2000-02-29T23:23:23.999999999",
1752 "2033-05-18T03:33:20.000000000",
1753 "2020-01-01T01:05:05.001",
1754 "2019-12-31T02:10:10.002",
1755 "2019-12-30T03:15:15.003",
1756 "2009-12-31T04:20:20.004132",
1757 "2010-01-01T05:25:25.005321",
1758 "2010-01-03T06:30:30.006163",
1759 "2010-01-04T07:35:35",
1760 "2006-01-01T08:40:40",
1761 "2005-12-31T09:45:45",
1762 "2008-12-28",
1763 "2008-12-29",
1764 "2012-01-01 01:02:03"])
1765 nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
1766 "2015-03-29 03:30:00"])
1767 ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
1768 "2018-10-28 02:36:00",
1769 "2018-10-28 03:46:00"])
1770 ambiguous_array = pa.array(ambiguous, type=ts_type)
1771 nonexistent_array = pa.array(nonexistent, type=ts_type)
1772
1773 for timezone in ["UTC", "US/Central", "Asia/Kolkata"]:
1774 options = pc.AssumeTimezoneOptions(timezone)
1775 ta = pa.array(timestamps, type=ts_type)
1776 expected = timestamps.tz_localize(timezone)
1777 result = pc.assume_timezone(ta, options=options)
1778 assert result.equals(pa.array(expected))
1779
1780 ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone))
1781 with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"):
1782 pc.assume_timezone(ta_zoned, options=options)
1783
1784 invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss")
1785 with pytest.raises(ValueError, match="not found in timezone database"):
1786 pc.assume_timezone(ta, options=invalid_options)
1787
1788 timezone = "Europe/Brussels"
1789
1790 # The 'nonexistent' parameter of tz_localize was introduced in pandas 0.24.0
1791 if Version(pd.__version__) >= Version("0.24.0"):
1792 options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
1793 options_nonexistent_earliest = pc.AssumeTimezoneOptions(
1794 timezone, ambiguous="raise", nonexistent="earliest")
1795 options_nonexistent_latest = pc.AssumeTimezoneOptions(
1796 timezone, ambiguous="raise", nonexistent="latest")
1797
1798 with pytest.raises(ValueError,
1799 match="Timestamp doesn't exist in "
1800 f"timezone '{timezone}'"):
1801 pc.assume_timezone(nonexistent_array,
1802 options=options_nonexistent_raise)
1803
1804 expected = pa.array(nonexistent.tz_localize(
1805 timezone, nonexistent="shift_forward"))
1806 result = pc.assume_timezone(
1807 nonexistent_array, options=options_nonexistent_latest)
1808 assert result.equals(expected)
1809
1810 expected = pa.array(nonexistent.tz_localize(
1811 timezone, nonexistent="shift_backward"))
1812 result = pc.assume_timezone(
1813 nonexistent_array, options=options_nonexistent_earliest)
1814 assert result.equals(expected)
1815
1816 options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
1817 options_ambiguous_latest = pc.AssumeTimezoneOptions(
1818 timezone, ambiguous="latest", nonexistent="raise")
1819 options_ambiguous_earliest = pc.AssumeTimezoneOptions(
1820 timezone, ambiguous="earliest", nonexistent="raise")
1821
1822 with pytest.raises(ValueError,
1823 match="Timestamp is ambiguous in "
1824 f"timezone '{timezone}'"):
1825 pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise)
1826
1827 expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True])
1828 result = pc.assume_timezone(
1829 ambiguous_array, options=options_ambiguous_earliest)
1830 assert result.equals(pa.array(expected))
1831
1832 expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False])
1833 result = pc.assume_timezone(
1834 ambiguous_array, options=options_ambiguous_latest)
1835 assert result.equals(pa.array(expected))
1836
1837
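# A minimal sketch, not collected by pytest: assume_timezone is assumed to
# keep the unit of the input and only attach the requested timezone to the
# output type, reinterpreting the naive values as local times in that zone.
def _example_assume_timezone_result_type():
    ta = pa.array([datetime(2020, 1, 1)], type=pa.timestamp("s"))
    result = pc.assume_timezone(ta, options=pc.AssumeTimezoneOptions("UTC"))
    assert result.type == pa.timestamp("s", tz="UTC")

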
1838 def test_count():
1839 arr = pa.array([1, 2, 3, None, None])
1840 assert pc.count(arr).as_py() == 3
1841 assert pc.count(arr, mode='only_valid').as_py() == 3
1842 assert pc.count(arr, mode='only_null').as_py() == 2
1843 assert pc.count(arr, mode='all').as_py() == 5
1844
1845
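# A minimal sketch, not collected by pytest: the 'only_valid' and
# 'only_null' counts always add up to the 'all' count, which is simply the
# length of the input.
def _example_count_modes_add_up():
    arr = pa.array([1, None, 3, None, None])
    valid = pc.count(arr, mode='only_valid').as_py()
    null = pc.count(arr, mode='only_null').as_py()
    assert valid + null == pc.count(arr, mode='all').as_py() == len(arr)

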
1846 def test_index():
1847 arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
1848 assert pc.index(arr, pa.scalar(0)).as_py() == 0
1849 assert pc.index(arr, pa.scalar(2, type=pa.int8())).as_py() == -1
1850 assert pc.index(arr, 4).as_py() == 4
1851 assert arr.index(3, start=2).as_py() == 3
1852 assert arr.index(None).as_py() == -1
1853
1854 arr = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
1855 assert arr.index(1).as_py() == 0
1856 assert arr.index(1, start=2).as_py() == 2
1857 assert arr.index(1, start=1, end=2).as_py() == -1
1858
1859
1860 def check_partition_nth(data, indices, pivot, null_placement):
1861 indices = indices.to_pylist()
1862 assert len(indices) == len(data)
1863 assert sorted(indices) == list(range(len(data)))
1864 until_pivot = [data[indices[i]] for i in range(pivot)]
1865 after_pivot = [data[indices[i]] for i in range(pivot, len(data))]
1866 p = data[indices[pivot]]
1867 if p is None:
1868 if null_placement == "at_start":
1869 assert all(v is None for v in until_pivot)
1870 else:
1871 assert all(v is None for v in after_pivot)
1872 else:
1873 if null_placement == "at_start":
1874 assert all(v is None or v <= p for v in until_pivot)
1875 assert all(v >= p for v in after_pivot)
1876 else:
1877 assert all(v <= p for v in until_pivot)
1878 assert all(v is None or v >= p for v in after_pivot)
1879
1880
1881 def test_partition_nth():
1882 data = list(range(100, 140))
1883 random.shuffle(data)
1884 pivot = 10
1885 indices = pc.partition_nth_indices(data, pivot=pivot)
1886 check_partition_nth(data, indices, pivot, "at_end")
1887
1888
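# A minimal sketch, not collected by pytest: the value that lands at the
# pivot position is the one a full sort would place there (the pivot-th
# smallest), while the two sides are only partially ordered.
def _example_partition_nth_order_statistic():
    data = [5, 1, 4, 2, 3]
    pivot = 2
    indices = pc.partition_nth_indices(data, pivot=pivot).to_pylist()
    assert data[indices[pivot]] == sorted(data)[pivot]

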
1889 def test_partition_nth_null_placement():
1890 data = list(range(10)) + [None] * 10
1891 random.shuffle(data)
1892
1893 for pivot in (0, 7, 13, 19):
1894 for null_placement in ("at_start", "at_end"):
1895 indices = pc.partition_nth_indices(data, pivot=pivot,
1896 null_placement=null_placement)
1897 check_partition_nth(data, indices, pivot, null_placement)
1898
1899
1900 def test_select_k_array():
1901 def validate_select_k(select_k_indices, arr, order, stable_sort=False):
1902 sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)])
1903 head_k_indices = sorted_indices.slice(0, len(select_k_indices))
1904 if stable_sort:
1905 assert select_k_indices == head_k_indices
1906 else:
1907 expected = pc.take(arr, head_k_indices)
1908 actual = pc.take(arr, select_k_indices)
1909 assert actual == expected
1910
1911 arr = pa.array([1, 2, None, 0])
1912 for k in [0, 2, 4]:
1913 for order in ["descending", "ascending"]:
1914 result = pc.select_k_unstable(
1915 arr, k=k, sort_keys=[("dummy", order)])
1916 validate_select_k(result, arr, order)
1917
1918 result = pc.top_k_unstable(arr, k=k)
1919 validate_select_k(result, arr, "descending")
1920
1921 result = pc.bottom_k_unstable(arr, k=k)
1922 validate_select_k(result, arr, "ascending")
1923
1924 result = pc.select_k_unstable(
1925 arr, options=pc.SelectKOptions(
1926 k=2, sort_keys=[("dummy", "descending")])
1927 )
1928 validate_select_k(result, arr, "descending")
1929
1930 result = pc.select_k_unstable(
1931 arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")])
1932 )
1933 validate_select_k(result, arr, "ascending")
1934
1935
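# A minimal sketch, not collected by pytest: select_k/top_k return indices,
# so materializing the selected values takes one extra pc.take call; sorting
# the result avoids relying on the (unstable) output order.
def _example_top_k_then_take():
    arr = pa.array([1, 4, 2, 3])
    top2 = pc.take(arr, pc.top_k_unstable(arr, k=2))
    assert sorted(top2.to_pylist(), reverse=True) == [4, 3]

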
1936 def test_select_k_table():
1937 def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False):
1938 sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys)
1939 head_k_indices = sorted_indices.slice(0, len(select_k_indices))
1940 if stable_sort:
1941 assert select_k_indices == head_k_indices
1942 else:
1943 expected = pc.take(tbl, head_k_indices)
1944 actual = pc.take(tbl, select_k_indices)
1945 assert actual == expected
1946
1947 table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]})
1948 for k in [0, 2, 4]:
1949 result = pc.select_k_unstable(
1950 table, k=k, sort_keys=[("a", "ascending")])
1951 validate_select_k(result, table, sort_keys=[("a", "ascending")])
1952
1953 result = pc.select_k_unstable(
1954 table, k=k, sort_keys=[("a", "ascending"), ("b", "ascending")])
1955 validate_select_k(
1956 result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])
1957
1958 result = pc.top_k_unstable(table, k=k, sort_keys=["a"])
1959 validate_select_k(result, table, sort_keys=[("a", "descending")])
1960
1961 result = pc.bottom_k_unstable(table, k=k, sort_keys=["a", "b"])
1962 validate_select_k(
1963 result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])
1964
1965 with pytest.raises(ValueError,
1966 match="select_k_unstable requires a nonnegative `k`"):
1967 pc.select_k_unstable(table)
1968
1969 with pytest.raises(ValueError,
1970 match="select_k_unstable requires a "
1971 "non-empty `sort_keys`"):
1972 pc.select_k_unstable(table, k=2, sort_keys=[])
1973
1974 with pytest.raises(ValueError, match="not a valid sort order"):
1975 pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")])
1976
1977 with pytest.raises(ValueError, match="Nonexistent sort key column"):
1978 pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")])
1979
1980
1981 def test_array_sort_indices():
1982 arr = pa.array([1, 2, None, 0])
1983 result = pc.array_sort_indices(arr)
1984 assert result.to_pylist() == [3, 0, 1, 2]
1985 result = pc.array_sort_indices(arr, order="ascending")
1986 assert result.to_pylist() == [3, 0, 1, 2]
1987 result = pc.array_sort_indices(arr, order="descending")
1988 assert result.to_pylist() == [1, 0, 3, 2]
1989 result = pc.array_sort_indices(arr, order="descending",
1990 null_placement="at_start")
1991 assert result.to_pylist() == [2, 1, 0, 3]
1992
1993 with pytest.raises(ValueError, match="not a valid sort order"):
1994 pc.array_sort_indices(arr, order="nonscending")
1995
1996
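# A minimal sketch, not collected by pytest: taking with the computed
# indices yields the values in sorted order, with nulls placed according to
# null_placement (at the end by default).
def _example_sort_indices_then_take():
    arr = pa.array([1, 2, None, 0])
    result = pc.take(arr, pc.array_sort_indices(arr))
    assert result.to_pylist() == [0, 1, 2, None]

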
1997 def test_sort_indices_array():
1998 arr = pa.array([1, 2, None, 0])
1999 result = pc.sort_indices(arr)
2000 assert result.to_pylist() == [3, 0, 1, 2]
2001 result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")])
2002 assert result.to_pylist() == [3, 0, 1, 2]
2003 result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")])
2004 assert result.to_pylist() == [1, 0, 3, 2]
2005 result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")],
2006 null_placement="at_start")
2007 assert result.to_pylist() == [2, 1, 0, 3]
2008 # Using SortOptions
2009 result = pc.sort_indices(
2010 arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")])
2011 )
2012 assert result.to_pylist() == [1, 0, 3, 2]
2013 result = pc.sort_indices(
2014 arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")],
2015 null_placement="at_start")
2016 )
2017 assert result.to_pylist() == [2, 1, 0, 3]
2018
2019
2020 def test_sort_indices_table():
2021 table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]})
2022
2023 result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
2024 assert result.to_pylist() == [3, 0, 1, 2]
2025 result = pc.sort_indices(table, sort_keys=[("a", "ascending")],
2026 null_placement="at_start")
2027 assert result.to_pylist() == [2, 3, 0, 1]
2028
2029 result = pc.sort_indices(
2030 table, sort_keys=[("a", "descending"), ("b", "ascending")]
2031 )
2032 assert result.to_pylist() == [1, 0, 3, 2]
2033 result = pc.sort_indices(
2034 table, sort_keys=[("a", "descending"), ("b", "ascending")],
2035 null_placement="at_start"
2036 )
2037 assert result.to_pylist() == [2, 1, 0, 3]
2038
2039 with pytest.raises(ValueError, match="Must specify one or more sort keys"):
2040 pc.sort_indices(table)
2041
2042 with pytest.raises(ValueError, match="Nonexistent sort key column"):
2043 pc.sort_indices(table, sort_keys=[("unknown", "ascending")])
2044
2045 with pytest.raises(ValueError, match="not a valid sort order"):
2046 pc.sort_indices(table, sort_keys=[("a", "nonscending")])
2047
2048
2049 def test_is_in():
2050 arr = pa.array([1, 2, None, 1, 2, 3])
2051
2052 result = pc.is_in(arr, value_set=pa.array([1, 3, None]))
2053 assert result.to_pylist() == [True, False, True, True, False, True]
2054
2055 result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True)
2056 assert result.to_pylist() == [True, False, False, True, False, True]
2057
2058 result = pc.is_in(arr, value_set=pa.array([1, 3]))
2059 assert result.to_pylist() == [True, False, False, True, False, True]
2060
2061 result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
2062 assert result.to_pylist() == [True, False, False, True, False, True]
2063
2064
2065 def test_index_in():
2066 arr = pa.array([1, 2, None, 1, 2, 3])
2067
2068 result = pc.index_in(arr, value_set=pa.array([1, 3, None]))
2069 assert result.to_pylist() == [0, None, 2, 0, None, 1]
2070
2071 result = pc.index_in(arr, value_set=pa.array([1, 3, None]),
2072 skip_nulls=True)
2073 assert result.to_pylist() == [0, None, None, 0, None, 1]
2074
2075 result = pc.index_in(arr, value_set=pa.array([1, 3]))
2076 assert result.to_pylist() == [0, None, None, 0, None, 1]
2077
2078 result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
2079 assert result.to_pylist() == [0, None, None, 0, None, 1]
2080
2081
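# A minimal sketch, not collected by pytest: index_in reports the position
# of each value inside value_set (null when absent), so is_in should agree
# with a validity check on index_in; using pc.is_valid as that check is an
# assumption of this sketch.
def _example_index_in_versus_is_in():
    arr = pa.array([1, 2, None, 3])
    value_set = pa.array([1, 3, None])
    membership = pc.is_in(arr, value_set=value_set)
    positions = pc.index_in(arr, value_set=value_set)
    assert membership == pc.is_valid(positions)

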
2082 def test_quantile():
2083 arr = pa.array([1, 2, 3, 4])
2084
2085 result = pc.quantile(arr)
2086 assert result.to_pylist() == [2.5]
2087
2088 result = pc.quantile(arr, interpolation='lower')
2089 assert result.to_pylist() == [2]
2090 result = pc.quantile(arr, interpolation='higher')
2091 assert result.to_pylist() == [3]
2092 result = pc.quantile(arr, interpolation='nearest')
2093 assert result.to_pylist() == [3]
2094 result = pc.quantile(arr, interpolation='midpoint')
2095 assert result.to_pylist() == [2.5]
2096 result = pc.quantile(arr, interpolation='linear')
2097 assert result.to_pylist() == [2.5]
2098
2099 arr = pa.array([1, 2])
2100
2101 result = pc.quantile(arr, q=[0.25, 0.5, 0.75])
2102 assert result.to_pylist() == [1.25, 1.5, 1.75]
2103
2104 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower')
2105 assert result.to_pylist() == [1, 1, 1]
2106 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher')
2107 assert result.to_pylist() == [2, 2, 2]
2108 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint')
2109 assert result.to_pylist() == [1.5, 1.5, 1.5]
2110 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest')
2111 assert result.to_pylist() == [1, 1, 2]
2112 result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear')
2113 assert result.to_pylist() == [1.25, 1.5, 1.75]
2114
2115 with pytest.raises(ValueError, match="Quantile must be between 0 and 1"):
2116 pc.quantile(arr, q=1.1)
2117 with pytest.raises(ValueError, match="not a valid quantile interpolation"):
2118 pc.quantile(arr, interpolation='zzz')
2119
2120
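# A minimal sketch, not collected by pytest: for arr = [1, 2] and q = 0.25
# the default 'linear' interpolation computes 1 + 0.25 * (2 - 1) = 1.25,
# which matches the first value asserted above.
def _example_quantile_linear_interpolation():
    arr = pa.array([1, 2])
    assert pc.quantile(arr, q=0.25).to_pylist() == [1.25]

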
2121 def test_tdigest():
2122 arr = pa.array([1, 2, 3, 4])
2123 result = pc.tdigest(arr)
2124 assert result.to_pylist() == [2.5]
2125
2126 arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])
2127 result = pc.tdigest(arr)
2128 assert result.to_pylist() == [2.5]
2129
2130 arr = pa.array([1, 2, 3, 4])
2131 result = pc.tdigest(arr, q=[0, 0.5, 1])
2132 assert result.to_pylist() == [1, 2.5, 4]
2133
2134 arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])
2135 result = pc.tdigest(arr, q=[0, 0.5, 1])
2136 assert result.to_pylist() == [1, 2.5, 4]
2137
2138
2139 def test_fill_null_segfault():
2140 # ARROW-12672
2141 arr = pa.array([None], pa.bool_()).fill_null(False)
2142 result = arr.cast(pa.int8())
2143 assert result == pa.array([0], pa.int8())
2144
2145
2146 def test_min_max_element_wise():
2147 arr1 = pa.array([1, 2, 3])
2148 arr2 = pa.array([3, 1, 2])
2149 arr3 = pa.array([2, 3, None])
2150
2151 result = pc.max_element_wise(arr1, arr2)
2152 assert result == pa.array([3, 2, 3])
2153 result = pc.min_element_wise(arr1, arr2)
2154 assert result == pa.array([1, 1, 2])
2155
2156 result = pc.max_element_wise(arr1, arr2, arr3)
2157 assert result == pa.array([3, 3, 3])
2158 result = pc.min_element_wise(arr1, arr2, arr3)
2159 assert result == pa.array([1, 1, 2])
2160
2161 # explicitly specifying the option
2162 result = pc.max_element_wise(arr1, arr3, skip_nulls=True)
2163 assert result == pa.array([2, 3, 3])
2164 result = pc.min_element_wise(arr1, arr3, skip_nulls=True)
2165 assert result == pa.array([1, 2, 3])
2166 result = pc.max_element_wise(
2167 arr1, arr3, options=pc.ElementWiseAggregateOptions())
2168 assert result == pa.array([2, 3, 3])
2169 result = pc.min_element_wise(
2170 arr1, arr3, options=pc.ElementWiseAggregateOptions())
2171 assert result == pa.array([1, 2, 3])
2172
2173 # not skipping nulls
2174 result = pc.max_element_wise(arr1, arr3, skip_nulls=False)
2175 assert result == pa.array([2, 3, None])
2176 result = pc.min_element_wise(arr1, arr3, skip_nulls=False)
2177 assert result == pa.array([1, 2, None])
2178
2179
2180 def test_make_struct():
2181 assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'}
2182
2183 assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == {
2184 'i': 1, 's': 'a'}
2185
2186 assert pc.make_struct([1, 2, 3],
2187 "a b c".split()) == pa.StructArray.from_arrays([
2188 [1, 2, 3],
2189 "a b c".split()], names='0 1'.split())
2190
2191 with pytest.raises(ValueError,
2192 match="Array arguments must all be the same length"):
2193 pc.make_struct([1, 2, 3, 4], "a b c".split())
2194
2195 with pytest.raises(ValueError, match="0 arguments but 2 field names"):
2196 pc.make_struct(field_names=['one', 'two'])
2197
2198
2199 def test_case_when():
2200 assert pc.case_when(pc.make_struct([True, False, None],
2201 [False, True, None]),
2202 [1, 2, 3],
2203 [11, 12, 13]) == pa.array([1, 12, None])
2204
2205
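# A minimal sketch, not collected by pytest: it assumes that passing one
# more value argument than there are condition fields makes the last one an
# "else" branch, used for rows where no condition is true.
def _example_case_when_with_else():
    cond = pc.make_struct([True, False, False])
    result = pc.case_when(cond, [1, 2, 3], [11, 12, 13])
    assert result == pa.array([1, 12, 13])

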
2206 def test_list_element():
2207 element_type = pa.struct([('a', pa.float64()), ('b', pa.int8())])
2208 list_type = pa.list_(element_type)
2209 l1 = [{'a': .4, 'b': 2}, None, {'a': .2, 'b': 4}, None, {'a': 5.6, 'b': 6}]
2210 l2 = [None, {'a': .52, 'b': 3}, {'a': .7, 'b': 4}, None, {'a': .6, 'b': 8}]
2211 lists = pa.array([l1, l2], list_type)
2212
2213 index = 1
2214 result = pa.compute.list_element(lists, index)
2215 expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type)
2216 assert result.equals(expected)
2217
2218 index = 4
2219 result = pa.compute.list_element(lists, index)
2220 expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type)
2221 assert result.equals(expected)
2222
2223
2224 def test_count_distinct():
2225 seed = datetime.now()
2226 samples = [seed.replace(year=y) for y in range(1992, 2092)]
2227 arr = pa.array(samples, pa.timestamp("ns"))
2228 result = pa.compute.count_distinct(arr)
2229 expected = pa.scalar(len(samples), type=pa.int64())
2230 assert result.equals(expected)
2231
2232
2233 def test_count_distinct_options():
2234 arr = pa.array([1, 2, 3, None, None])
2235 assert pc.count_distinct(arr).as_py() == 3
2236 assert pc.count_distinct(arr, mode='only_valid').as_py() == 3
2237 assert pc.count_distinct(arr, mode='only_null').as_py() == 1
2238 assert pc.count_distinct(arr, mode='all').as_py() == 4