]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | from datetime import datetime | |
19 | from functools import lru_cache, partial | |
20 | import inspect | |
21 | import pickle | |
22 | import pytest | |
23 | import random | |
24 | import sys | |
25 | import textwrap | |
26 | ||
27 | import numpy as np | |
28 | ||
29 | try: | |
30 | import pandas as pd | |
31 | except ImportError: | |
32 | pd = None | |
33 | ||
34 | import pyarrow as pa | |
35 | import pyarrow.compute as pc | |
36 | ||
# (type, values) pairs covering the main Arrow array kinds, for tests that
# exercise a kernel across many input types.
all_array_types = [
    ('bool', [True, False, False, True, True]),
    ('uint8', np.arange(5)),
    ('int8', np.arange(5)),
    ('uint16', np.arange(5)),
    ('int16', np.arange(5)),
    ('uint32', np.arange(5)),
    ('int32', np.arange(5)),
    ('uint64', np.arange(5, 10)),
    ('int64', np.arange(5, 10)),
    ('float', np.arange(0, 0.5, 0.1)),
    ('double', np.arange(0, 0.5, 0.1)),
    ('string', ['a', 'b', None, 'ddd', 'ee']),
    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
        {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
]

# Concrete compute wrappers exported by pyarrow.compute; the wrapper
# generator tags each of them with the __arrow_compute_function__ attribute.
exported_functions = [
    func for (name, func) in sorted(pc.__dict__.items())
    if hasattr(func, '__arrow_compute_function__')]

# FunctionOptions subclasses exported by pyarrow.compute (the FunctionOptions
# base class itself is deliberately excluded).
exported_option_classes = [
    cls for (name, cls) in sorted(pc.__dict__.items())
    if (isinstance(cls, type) and
        cls is not pc.FunctionOptions and
        issubclass(cls, pc.FunctionOptions))]

# Arrow numeric types used to parametrize aggregation tests.
# NOTE(review): int32/uint32 and float16 are absent — confirm intentional.
numerical_arrow_types = [
    pa.int8(),
    pa.int16(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint64(),
    pa.float32(),
    pa.float64()
]
78 | ||
79 | ||
def test_exported_functions():
    """Every exported concrete function rejects wrongly-typed arguments.

    An unregistered function (e.g. with a mismatching name) would raise
    KeyError instead of TypeError.
    """
    funcs = exported_functions
    assert len(funcs) >= 10
    expected_msg = ("Got unexpected argument type "
                    "<class 'object'> for compute function")
    for func in funcs:
        arity = func.__arrow_compute_function__['arity']
        # Varargs functions (arity is Ellipsis) get an arbitrary count.
        nargs = 3 if arity is Ellipsis else arity
        dummy_args = [object()] * nargs
        with pytest.raises(TypeError, match=expected_msg):
            func(*dummy_args)
97 | ||
98 | ||
def test_exported_option_classes():
    """Option classes expose a clean constructor signature.

    Each signature must be introspectable and contain no *args/**kwargs.
    """
    assert len(exported_option_classes) >= 10
    banned_kinds = (inspect.Parameter.VAR_POSITIONAL,
                    inspect.Parameter.VAR_KEYWORD)
    for option_cls in exported_option_classes:
        params = inspect.signature(option_cls).parameters.values()
        assert not any(param.kind in banned_kinds for param in params)
109 | ||
110 | ||
def test_option_class_equality():
    """Each option class supports equality, repr and serialization round-trip.

    Also checks that every exported option class is covered here, either in
    the explicit list below or via a default construction fallback.
    """
    options = [
        pc.ArraySortOptions(),
        pc.AssumeTimezoneOptions("UTC"),
        pc.CastOptions.safe(pa.int8()),
        pc.CountOptions(),
        pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
        pc.DictionaryEncodeOptions(),
        pc.ElementWiseAggregateOptions(skip_nulls=True),
        pc.ExtractRegexOptions("pattern"),
        pc.FilterOptions(),
        pc.IndexOptions(pa.scalar(1)),
        pc.JoinOptions(),
        pc.MakeStructOptions(["field", "names"],
                             field_nullability=[True, True],
                             field_metadata=[pa.KeyValueMetadata({"a": "1"}),
                                             pa.KeyValueMetadata({"b": "2"})]),
        pc.MatchSubstringOptions("pattern"),
        pc.ModeOptions(),
        pc.NullOptions(),
        pc.PadOptions(5),
        pc.PartitionNthOptions(1, null_placement="at_start"),
        pc.QuantileOptions(),
        pc.ReplaceSliceOptions(0, 1, "a"),
        pc.ReplaceSubstringOptions("a", "b"),
        pc.RoundOptions(2, "towards_infinity"),
        pc.RoundToMultipleOptions(100, "towards_infinity"),
        pc.ScalarAggregateOptions(),
        pc.SelectKOptions(0, sort_keys=[("b", "ascending")]),
        pc.SetLookupOptions(pa.array([1])),
        pc.SliceOptions(0, 1, 1),
        pc.SortOptions([("dummy", "descending")], null_placement="at_start"),
        pc.SplitOptions(),
        pc.SplitPatternOptions("pattern"),
        pc.StrftimeOptions(),
        pc.StrptimeOptions("%Y", "s"),
        pc.TakeOptions(),
        pc.TDigestOptions(),
        pc.TrimOptions(" "),
        pc.VarianceOptions(),
        pc.WeekOptions(week_starts_monday=True, count_from_zero=False,
                       first_week_is_fully_in_year=False),
    ]
    # TODO: We should test on windows once ARROW-13168 is resolved.
    # Timezone database is not available on Windows yet
    if sys.platform != 'win32':
        options.append(pc.AssumeTimezoneOptions("Europe/Ljubljana"))

    classes = {type(option) for option in options}
    for cls in exported_option_classes:
        # Timezone database is not available on Windows yet
        if cls not in classes and sys.platform != 'win32' and \
                cls != pc.AssumeTimezoneOptions:
            # Fall back to default construction so every class is covered.
            try:
                options.append(cls())
            except TypeError:
                pytest.fail(f"Options class is not tested: {cls}")
    for option in options:
        assert option == option
        assert repr(option).startswith(option.__class__.__name__)
        buf = option.serialize()
        deserialized = pc.FunctionOptions.deserialize(buf)
        assert option == deserialized
        assert repr(option) == repr(deserialized)
    # Adjacent entries in the list must never compare equal to each other.
    for option1, option2 in zip(options, options[1:]):
        assert option1 != option2

    assert repr(pc.IndexOptions(pa.scalar(1))) == "IndexOptions(value=int64:1)"
    assert repr(pc.ArraySortOptions()) == \
        "ArraySortOptions(order=Ascending, null_placement=AtEnd)"
181 | ||
182 | ||
def test_list_functions():
    """The function registry is populated and contains known entries."""
    registered = pc.list_functions()
    assert len(registered) > 10
    assert "add" in registered
186 | ||
187 | ||
def _check_get_function(name, expected_func_cls, expected_ker_cls,
                        min_num_kernels=1):
    """Fetch *name* from the registry and sanity-check its class/kernels."""
    func = pc.get_function(name)
    assert isinstance(func, expected_func_cls)
    kernels = func.kernels
    assert func.num_kernels >= min_num_kernels
    assert func.num_kernels == len(kernels)
    for kernel in kernels:
        assert isinstance(kernel, expected_ker_cls)
196 | ||
197 | ||
def test_get_function_scalar():
    """'add' is registered as a scalar function with several kernels."""
    _check_get_function("add", pc.ScalarFunction, pc.ScalarKernel,
                        min_num_kernels=8)
200 | ||
201 | ||
def test_get_function_vector():
    """'unique' is registered as a vector function with several kernels."""
    _check_get_function("unique", pc.VectorFunction, pc.VectorKernel,
                        min_num_kernels=8)
204 | ||
205 | ||
def test_get_function_scalar_aggregate():
    """'mean' is registered as a scalar aggregate function."""
    _check_get_function("mean", pc.ScalarAggregateFunction,
                        pc.ScalarAggregateKernel, min_num_kernels=8)
209 | ||
210 | ||
def test_get_function_hash_aggregate():
    """'hash_sum' is registered as a hash aggregate function."""
    _check_get_function("hash_sum", pc.HashAggregateFunction,
                        pc.HashAggregateKernel, min_num_kernels=1)
214 | ||
215 | ||
def test_call_function_with_memory_pool():
    """call_function and the generated wrappers accept an explicit pool."""
    arr = pa.array(["foo", "bar", "baz"])
    indices = np.array([2, 2, 1])
    expected = pa.array(["baz", "baz", "bar"])
    pool = pa.default_memory_pool()

    assert arr.take(indices).equals(expected)
    taken = pc.call_function('take', [arr, indices], memory_pool=pool)
    assert taken.equals(expected)
    assert pc.take(arr, indices, memory_pool=pool).equals(expected)
228 | ||
229 | ||
def test_pickle_functions():
    """Registered functions survive a pickle round-trip."""
    for name in pc.list_functions():
        original = pc.get_function(name)
        clone = pickle.loads(pickle.dumps(original))
        assert type(clone) is type(original)
        assert clone.name == original.name
        assert clone.arity == original.arity
        assert clone.num_kernels == original.num_kernels
239 | ||
240 | ||
def test_pickle_global_functions():
    """Global wrappers (manual or generated) pickle back to the same object."""
    for name in pc.list_functions():
        wrapper = getattr(pc, name)
        assert pickle.loads(pickle.dumps(wrapper)) is wrapper
247 | ||
248 | ||
def test_function_attributes():
    """Sanity-check the attributes of every registered function."""
    for name in pc.list_functions():
        func = pc.get_function(name)
        assert isinstance(func, pc.Function)
        assert func.name == name
        kernels = func.kernels
        assert len(kernels) == func.num_kernels
        for kernel in kernels:
            assert isinstance(kernel, pc.Kernel)
            repr(kernel)  # must not raise
        # arity is Ellipsis for varargs functions, otherwise a count >= 1
        if func.arity is not Ellipsis:
            assert func.arity >= 1
        repr(func)  # must not raise
263 | ||
264 | ||
def test_input_type_conversion():
    """Python sequences and scalars are converted to Arrow automatically."""
    # Sequence arguments become arrays
    assert pc.add([1, 2], [4, None]).to_pylist() == [5, None]
    # Bare Python numbers become scalars
    assert pc.add([1, 2], 4).to_pylist() == [5, 6]
    # Non-numeric scalars work too
    matches = pc.equal(["foo", "bar", None], "foo")
    assert matches.to_pylist() == [True, False, None]
275 | ||
276 | ||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_array(arrow_type):
    """Array.sum and pc.sum agree, including null and empty handling."""
    plain = pa.array([1, 2, 3, 4], type=arrow_type)
    assert plain.sum().as_py() == 10
    assert pc.sum(plain).as_py() == 10

    # Nulls are skipped by default
    with_null = pa.array([1, 2, 3, 4, None], type=arrow_type)
    assert with_null.sum().as_py() == 10
    assert pc.sum(with_null).as_py() == 10

    # An all-null array sums to null unless min_count permits empty input
    all_null = pa.array([None], type=arrow_type)
    assert all_null.sum().as_py() is None
    assert pc.sum(all_null).as_py() is None
    assert all_null.sum(min_count=0).as_py() == 0
    assert pc.sum(all_null, min_count=0).as_py() == 0

    empty = pa.array([], type=arrow_type)
    assert empty.sum().as_py() is None
    assert empty.sum(min_count=0).as_py() == 0
    assert pc.sum(empty, min_count=0).as_py() == 0
297 | ||
298 | ||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_chunked_array(arrow_type):
    """pc.sum gives the same answer regardless of the chunk layout."""
    def chunked(*chunks):
        return pa.chunked_array(
            [pa.array(list(chunk), type=arrow_type) for chunk in chunks])

    assert pc.sum(chunked([1, 2, 3, 4])).as_py() == 10
    assert pc.sum(chunked([1, 2], [3, 4])).as_py() == 10
    # Empty chunks in the middle are ignored
    assert pc.sum(chunked([1, 2], [], [3, 4])).as_py() == 10

    no_chunks = pa.chunked_array((), type=arrow_type)
    assert no_chunks.num_chunks == 0
    assert pc.sum(no_chunks).as_py() is None
    assert pc.sum(no_chunks, min_count=0).as_py() == 0
320 | ||
321 | ||
def test_mode_array():
    """Mode over plain arrays, incl. n>1, empty input, nulls (ARROW-9917)."""
    arr = pa.array([1, 1, 3, 4, 3, 5], type='int64')
    top1 = pc.mode(arr)
    assert len(top1) == 1
    assert top1[0].as_py() == {"mode": 1, "count": 2}

    top2 = pc.mode(arr, n=2)
    assert len(top2) == 2
    assert top2[0].as_py() == {"mode": 1, "count": 2}
    assert top2[1].as_py() == {"mode": 3, "count": 2}

    # Empty input yields an empty result
    assert len(pc.mode(pa.array([], type='int64'))) == 0

    # Nulls (when not skipped) or an unmet min_count empty the result
    with_null = pa.array([1, 1, 3, 4, 3, None], type='int64')
    assert len(pc.mode(with_null, skip_nulls=False)) == 0
    assert len(pc.mode(with_null, min_count=6)) == 0
    assert len(pc.mode(with_null, skip_nulls=False, min_count=5)) == 0
344 | ||
345 | ||
def test_mode_chunked_array():
    """Mode over chunked arrays, including zero chunks (ARROW-9917)."""
    arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')])
    top1 = pc.mode(arr)
    assert len(top1) == 1
    assert top1[0].as_py() == {"mode": 1, "count": 2}

    top2 = pc.mode(arr, n=2)
    assert len(top2) == 2
    assert [entry.as_py() for entry in top2] == \
        [{"mode": 1, "count": 2}, {"mode": 3, "count": 2}]

    no_chunks = pa.chunked_array((), type='int64')
    assert no_chunks.num_chunks == 0
    assert len(pc.mode(no_chunks)) == 0
361 | ||
362 | ||
def test_variance():
    """Default is population variance (ddof=0); ddof=1 gives sample variance."""
    values = [1, 2, 3, 4, 5, 6, 7, 8]
    for kwargs, expected in [({}, 5.25), ({'ddof': 0}, 5.25),
                             ({'ddof': 1}, 6.0)]:
        assert pc.variance(values, **kwargs).as_py() == expected
368 | ||
369 | ||
def test_count_substring():
    """Per-element substring counts come back in the string's offset type."""
    for ty, offset_ty in [(pa.string(), pa.int32()),
                          (pa.large_string(), pa.int64())]:
        arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty)

        assert pc.count_substring(arr, "ab").equals(
            pa.array([1, 1, 2, 0, 0, None], type=offset_ty))
        assert pc.count_substring(arr, "ab", ignore_case=True).equals(
            pa.array([1, 1, 2, 0, 1, None], type=offset_ty))
382 | ||
383 | ||
def test_count_substring_regex():
    """Per-element regex match counts come back in the string's offset type."""
    for ty, offset_ty in [(pa.string(), pa.int32()),
                          (pa.large_string(), pa.int64())]:
        arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty)

        assert pc.count_substring_regex(arr, "a+").equals(
            pa.array([1, 1, 3, 1, 0, None], type=offset_ty))
        assert pc.count_substring_regex(arr, "a+", ignore_case=True).equals(
            pa.array([1, 1, 2, 1, 1, None], type=offset_ty))
396 | ||
397 | ||
def test_find_substring():
    """find_substring(_regex) returns the first match index, or -1."""
    for ty in (pa.string(), pa.binary(), pa.large_string(), pa.large_binary()):
        arr = pa.array(["ab", "cab", "ba", None], type=ty)
        assert pc.find_substring(arr, "ab").to_pylist() == [0, 1, -1, None]
        assert pc.find_substring_regex(arr, "a?b").to_pylist() == \
            [0, 1, 0, None]

        mixed_case = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty)
        assert pc.find_substring(
            mixed_case, "aB*", ignore_case=True).to_pylist() == [0, 1, -1, -1]
        assert pc.find_substring_regex(
            mixed_case, "a?b", ignore_case=True).to_pylist() == [0, 1, 0, 0]
413 | ||
414 | ||
def test_match_like():
    """SQL LIKE matching with an escaped '%', with/without case folding."""
    arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
    assert pc.match_like(arr, r"_a\%%").to_pylist() == \
        [False, True, False, True, None]

    mixed_case = pa.array(["aB", "bA%", "ba", "ca%d", None])
    assert pc.match_like(mixed_case, r"_a\%%", ignore_case=True).to_pylist() \
        == [False, True, False, True, None]
    assert pc.match_like(mixed_case, r"_a\%%", ignore_case=False).to_pylist() \
        == [False, False, False, True, None]
428 | ||
429 | ||
def test_match_substring():
    """Literal substring matching, with and without case folding."""
    arr = pa.array(["ab", "abc", "ba", None])
    assert pc.match_substring(arr, "ab").to_pylist() == \
        [True, True, False, None]

    accented = pa.array(["áB", "Ábc", "ba", None])
    assert pc.match_substring(accented, "áb", ignore_case=True).to_pylist() \
        == [True, True, False, None]
    assert pc.match_substring(accented, "áb", ignore_case=False).to_pylist() \
        == [False, False, False, None]
443 | ||
444 | ||
def test_match_substring_regex():
    """Regex substring matching, with and without case folding."""
    arr = pa.array(["ab", "abc", "ba", "c", None])
    assert pc.match_substring_regex(arr, "^a?b").to_pylist() == \
        [True, True, True, False, None]

    mixed_case = pa.array(["aB", "Abc", "BA", "c", None])
    assert pc.match_substring_regex(
        mixed_case, "^a?b", ignore_case=True).to_pylist() == \
        [True, True, True, False, None]
    assert pc.match_substring_regex(
        mixed_case, "^a?b", ignore_case=False).to_pylist() == \
        [False, False, False, False, None]
458 | ||
459 | ||
def test_trim():
    """Whitespace/character trimming; \\u3000 is unicode-only whitespace."""
    values = [" foo", None, " \u3000foo bar \t"]

    assert pc.utf8_trim_whitespace(pa.array(values)).to_pylist() == \
        ["foo", None, "foo bar"]
    # The ascii variant leaves the unicode space in place
    assert pc.ascii_trim_whitespace(pa.array(values)).to_pylist() == \
        ["foo", None, "\u3000foo bar"]
    # Explicit character set
    assert pc.utf8_trim(pa.array(values),
                        characters=' f\u3000').to_pylist() == \
        ["oo", None, "oo bar \t"]
476 | ||
477 | ||
def test_slice_compatibility():
    """utf8_slice_codeunits matches Python slicing for every start/stop/step."""
    arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
    py_strings = arr.to_pylist()
    for start in range(-6, 6):
        for stop in range(-6, 6):
            for step in (-3, -2, -1, 1, 2, 3):
                expected = [s[start:stop:step] for s in py_strings]
                sliced = pc.utf8_slice_codeunits(
                    arr, start=start, stop=stop, step=step)
                assert sliced.to_pylist() == expected
488 | ||
489 | ||
def test_split_pattern():
    """Literal-pattern splitting with max_splits and reverse modes."""
    arr = pa.array(["-foo---bar--", "---foo---b"])

    assert pc.split_pattern(arr, pattern="---").to_pylist() == \
        [["-foo", "bar--"], ["", "foo", "b"]]
    assert pc.split_pattern(arr, pattern="---", max_splits=1).to_pylist() == \
        [["-foo", "bar--"], ["", "foo---b"]]
    # reverse splits starting from the end of the string
    assert pc.split_pattern(
        arr, pattern="---", max_splits=1, reverse=True).to_pylist() == \
        [["-foo", "bar--"], ["---foo", "b"]]
503 | ||
504 | ||
def test_split_whitespace_utf8():
    """Unicode whitespace splitting with max_splits and reverse modes."""
    arr = pa.array(["foo bar", " foo \u3000\tb"])

    assert pc.utf8_split_whitespace(arr).to_pylist() == \
        [["foo", "bar"], ["", "foo", "b"]]
    assert pc.utf8_split_whitespace(arr, max_splits=1).to_pylist() == \
        [["foo", "bar"], ["", "foo \u3000\tb"]]
    assert pc.utf8_split_whitespace(
        arr, max_splits=1, reverse=True).to_pylist() == \
        [["foo", "bar"], [" foo", "b"]]
518 | ||
519 | ||
def test_split_whitespace_ascii():
    """ASCII whitespace splitting does not treat \\u3000 as whitespace."""
    arr = pa.array(["foo bar", " foo \u3000\tb"])

    assert pc.ascii_split_whitespace(arr).to_pylist() == \
        [["foo", "bar"], ["", "foo", "\u3000", "b"]]
    assert pc.ascii_split_whitespace(arr, max_splits=1).to_pylist() == \
        [["foo", "bar"], ["", "foo \u3000\tb"]]
    assert pc.ascii_split_whitespace(
        arr, max_splits=1, reverse=True).to_pylist() == \
        [["foo", "bar"], [" foo \u3000", "b"]]
533 | ||
534 | ||
def test_split_pattern_regex():
    """Regex splitting with max_splits; reverse mode is unimplemented."""
    arr = pa.array(["-foo---bar--", "---foo---b"])

    assert pc.split_pattern_regex(arr, pattern="-+").to_pylist() == \
        [["", "foo", "bar", ""], ["", "foo", "b"]]
    assert pc.split_pattern_regex(
        arr, pattern="-+", max_splits=1).to_pylist() == \
        [["", "foo---bar--"], ["", "foo---b"]]

    with pytest.raises(NotImplementedError,
                       match="Cannot split in reverse with regex"):
        pc.split_pattern_regex(arr, pattern="---", max_splits=1, reverse=True)
549 | ||
550 | ||
def test_min_max():
    """min_max and the three equivalent ways of passing its options."""
    data = [4, 5, 6, None, 1]
    with_nulls_skipped = {'min': 1, 'max': 6}
    all_null = {'min': None, 'max': None}

    assert pc.min_max(data).as_py() == with_nulls_skipped
    assert pc.min_max(
        data, options=pc.ScalarAggregateOptions()).as_py() == \
        with_nulls_skipped
    assert pc.min_max(
        data, options=pc.ScalarAggregateOptions(skip_nulls=True)).as_py() == \
        with_nulls_skipped
    assert pc.min_max(
        data, options=pc.ScalarAggregateOptions(skip_nulls=False)).as_py() == \
        all_null

    # Options may also be given as a dict of kwargs...
    assert pc.min_max(data, options={'skip_nulls': False}).as_py() == all_null
    # ...or as named function arguments
    assert pc.min_max(data, skip_nulls=False).as_py() == all_null

    # But not both at the same time
    with pytest.raises(TypeError):
        pc.min_max(data, options=pc.ScalarAggregateOptions(),
                   skip_nulls=False)

    # Options of the wrong class are rejected
    with pytest.raises(TypeError):
        pc.min_max(data, options=pc.TakeOptions())

    # Missing argument
    with pytest.raises(ValueError,
                       match="Function min_max accepts 1 argument"):
        pc.min_max()
584 | ||
585 | ||
def test_any():
    """Kleene-logic 'any' over boolean input (ARROW-1846)."""
    strict = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)

    empty = pa.array([], type='bool')
    assert pc.any(empty).as_py() is None
    assert pc.any(empty, min_count=0).as_py() is False
    assert pc.any(empty, options=strict).as_py() is False

    some_true = pa.array([False, None, True])
    assert pc.any(some_true).as_py() is True
    assert pc.any(some_true, options=strict).as_py() is True

    # No True present: null dominates under Kleene logic
    no_true = pa.array([False, None, False])
    assert pc.any(no_true).as_py() is False
    assert pc.any(no_true, options=strict).as_py() is None
603 | ||
604 | ||
def test_all():
    """Kleene-logic 'all' over boolean input, incl. chunked (ARROW-10301)."""
    strict = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)

    empty = pa.array([], type='bool')
    assert pc.all(empty).as_py() is None
    assert pc.all(empty, min_count=0).as_py() is True
    assert pc.all(empty, options=strict).as_py() is True

    has_false = pa.array([False, True])
    assert pc.all(has_false).as_py() is False
    assert pc.all(has_false, options=strict).as_py() is False

    # No False present: null dominates under Kleene logic
    has_null = pa.array([True, None])
    assert pc.all(has_null).as_py() is True
    assert pc.all(has_null, options=strict).as_py() is None

    chunked_with_null = pa.chunked_array([[True], [True, None]])
    assert pc.all(chunked_with_null).as_py() is True
    assert pc.all(chunked_with_null, options=strict).as_py() is None

    chunked_with_false = pa.chunked_array([[True], [False]])
    assert pc.all(chunked_with_false).as_py() is False
    assert pc.all(chunked_with_false, options=strict).as_py() is False
630 | ||
631 | ||
def test_is_valid():
    """A generated wrapper without options rejects the options kwarg."""
    data = [4, 5, None]
    assert pc.is_valid(data).to_pylist() == [True, True, False]

    with pytest.raises(TypeError):
        pc.is_valid(data, options=None)
639 | ||
640 | ||
def test_generated_docstrings():
    """The auto-generated wrapper docstrings match the expected text,
    including the per-option parameter documentation."""
    assert pc.min_max.__doc__ == textwrap.dedent("""\
        Compute the minimum and maximum values of a numeric array.

        Null values are ignored by default.
        This can be changed through ScalarAggregateOptions.

        Parameters
        ----------
        array : Array-like
            Argument to compute function
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        options : pyarrow.compute.ScalarAggregateOptions, optional
            Parameters altering compute function semantics.
        skip_nulls : optional
            Parameter for ScalarAggregateOptions constructor. Either `options`
            or `skip_nulls` can be passed, but not both at the same time.
        min_count : optional
            Parameter for ScalarAggregateOptions constructor. Either `options`
            or `min_count` can be passed, but not both at the same time.
        """)
    assert pc.add.__doc__ == textwrap.dedent("""\
        Add the arguments element-wise.

        Results will wrap around on integer overflow.
        Use function "add_checked" if you want overflow
        to return an error.

        Parameters
        ----------
        x : Array-like or scalar-like
            Argument to compute function
        y : Array-like or scalar-like
            Argument to compute function
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
679 | ||
680 | ||
def test_generated_signatures():
    """The self-documentation provided by signatures should show acceptable
    options and their default values."""
    # Fixed-arity function without options
    sig = inspect.signature(pc.add)
    assert str(sig) == "(x, y, *, memory_pool=None)"
    # Functions with options expose each option as a keyword with its default
    sig = inspect.signature(pc.min_max)
    assert str(sig) == ("(array, *, memory_pool=None, "
                        "options=None, skip_nulls=True, min_count=1)")
    sig = inspect.signature(pc.quantile)
    assert str(sig) == ("(array, *, memory_pool=None, "
                        "options=None, q=0.5, interpolation='linear', "
                        "skip_nulls=True, min_count=0)")
    # Varargs function
    sig = inspect.signature(pc.binary_join_element_wise)
    assert str(sig) == ("(*strings, memory_pool=None, options=None, "
                        "null_handling='emit_null', null_replacement='')")
696 | ||
697 | ||
698 | # We use isprintable to find about codepoints that Python doesn't know, but | |
699 | # utf8proc does (or in a future version of Python the other way around). | |
700 | # These codepoints cannot be compared between Arrow and the Python | |
701 | # implementation. | |
@lru_cache()
def find_new_unicode_codepoints():
    """Return codepoints on which utf8proc and Python disagree about
    printability.

    Those codepoints cannot be compared between Arrow and the Python
    implementation (one side knows codepoints the other does not, yet).
    """
    characters = [chr(c) for c in range(0x80, 0x11000)
                  if not (0xD800 <= c < 0xE000)]  # skip surrogates
    arrow_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist()
    return {ord(c) for c, printable in zip(characters, arrow_printable)
            if printable != c.isprintable()}
712 | ||
713 | ||
714 | # Python claims there are not alpha, not sure why, they are in | |
715 | # gc='Other Letter': https://graphemica.com/%E1%B3%B2 | |
716 | unknown_issue_is_alpha = {0x1cf2, 0x1cf3} | |
717 | # utf8proc does not know if codepoints are lower case | |
718 | utf8proc_issue_is_lower = { | |
719 | 0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, | |
720 | 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, | |
721 | 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d, | |
722 | 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, | |
723 | 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, | |
724 | 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, | |
725 | 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, | |
726 | 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, | |
727 | 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, | |
728 | 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, | |
729 | 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, | |
730 | 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, | |
731 | 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, | |
732 | 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, | |
733 | 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, | |
734 | 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, | |
735 | 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, | |
736 | 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, | |
737 | 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, | |
738 | 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, | |
739 | 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, | |
740 | 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, | |
741 | 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, | |
742 | 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } | |
# utf8proc does not store whether a codepoint is numeric, so these
# codepoints cannot be checked against Python's str.isnumeric().
# FIX: this set used to be assigned twice; the first assignment (which
# also listed 0x10fc5-0x10fcb under the same rationale) was dead code,
# immediately shadowed by a second assignment missing those entries.
# The two are merged into one definition here.
numeric_info_missing = {
    0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
    0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
    0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
    0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
    0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
    0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
    0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
    0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
    0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
    0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
    0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
    0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
    0x10fcb, }
# utf8proc has no digit information for these codepoints, so they
# cannot be checked against Python's str.isdigit().
digit_info_missing = {
    0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
    0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070,
    0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080,
    0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
    0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464,
    0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476,
    0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488,
    0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f,
    0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9,
    0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777,
    0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e,
    0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
    0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
    0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
    0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
    0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
788 | ||
# Predicate name -> codepoints whose Python/utf8proc classifications are
# known to disagree; these are skipped by test_string_py_compat_boolean.
codepoints_ignore = {
    'is_alpha': unknown_issue_is_alpha,
    'is_alnum': (digit_info_missing
                 | numeric_info_missing
                 | unknown_issue_is_alpha),
    'is_numeric': numeric_info_missing,
    'is_digit': digit_info_missing,
    'is_lower': utf8proc_issue_is_lower,
}
797 | ||
798 | ||
@pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
                                           'is_ascii', 'is_decimal',
                                           'is_digit', 'is_lower',
                                           'is_numeric', 'is_printable',
                                           'is_space', 'is_upper', ])
@pytest.mark.parametrize('variant', ['ascii', 'utf8'])
def test_string_py_compat_boolean(function_name, variant):
    """Compare ascii_*/utf8_* string predicates with Python's equivalent
    str methods over the codepoint range.

    BUG FIX: the loop bound previously read ``128 if ascii else 0x11000``,
    which tests the *builtin* ``ascii`` function (always truthy) — so even
    the utf8 variants only ever scanned codepoints < 128.  The intended
    condition is ``variant == 'ascii'``.
    """
    arrow_name = variant + "_" + function_name
    py_name = function_name.replace('_', '')
    # Known-incompatible codepoints, plus anything newer than the Unicode
    # version utf8proc was built against.
    ignore = codepoints_ignore.get(function_name, set()) | \
        find_new_unicode_codepoints()
    for i in range(128 if variant == 'ascii' else 0x11000):
        if i in range(0xD800, 0xE000):
            continue  # bug? pyarrow doesn't allow utf16 surrogates
        # the issues we know of, we skip
        if i in ignore:
            continue
        # Compare results with the equivalent Python predicate
        # (except "is_space" where functions are known to be incompatible)
        c = chr(i)
        if hasattr(pc, arrow_name) and function_name != 'is_space':
            ar = pa.array([c])
            arrow_func = getattr(pc, arrow_name)
            assert arrow_func(ar)[0].as_py() == getattr(c, py_name)()
823 | ||
824 | ||
def test_pad():
    """Padding kernels center/left/right-pad short values to `width`;
    values already at least `width` long are untouched."""
    ascii_arr = pa.array([None, 'a', 'abcd'])
    utf8_arr = pa.array([None, 'á', 'abcd'])
    cases = [
        (pc.ascii_center, ascii_arr, [None, ' a ', 'abcd']),
        (pc.ascii_lpad, ascii_arr, [None, '  a', 'abcd']),
        (pc.ascii_rpad, ascii_arr, [None, 'a  ', 'abcd']),
        (pc.utf8_center, utf8_arr, [None, ' á ', 'abcd']),
        (pc.utf8_lpad, utf8_arr, [None, '  á', 'abcd']),
        (pc.utf8_rpad, utf8_arr, [None, 'á  ', 'abcd']),
    ]
    for kernel, arr, expected in cases:
        assert kernel(arr, width=3).tolist() == expected
835 | ||
836 | ||
@pytest.mark.pandas
def test_replace_slice():
    """binary/utf8_replace_slice agree with pandas str.slice_replace
    for every (start, stop) combination in a small window."""
    span = range(-3, 4)

    def check(arr, kernel):
        series = arr.to_pandas()
        for begin in span:
            for end in span:
                expected = series.str.slice_replace(begin, end, 'XX')
                got = kernel(arr, start=begin, stop=end, replacement='XX')
                assert got.tolist() == expected.tolist()

    # ASCII data exercises the binary kernel...
    check(pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']),
          pc.binary_replace_slice)
    # ...multi-byte codepoints exercise the utf8 kernel
    check(pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']),
          pc.utf8_replace_slice)
858 | ||
859 | ||
def test_replace_plain():
    """replace_substring does literal (non-regex) replacement, nulls pass through."""
    replaced = pc.replace_substring(pa.array(['foo', 'food', None]),
                                    pattern='foo', replacement='bar')
    assert replaced.tolist() == ['bar', 'bard', None]
864 | ||
865 | ||
def test_replace_regex():
    """replace_substring_regex supports backreferences in the replacement."""
    source = pa.array(['foo', 'mood', None])
    replaced = pc.replace_substring_regex(source, pattern='(.)oo',
                                          replacement=r'\100')
    assert replaced.tolist() == ['f00', 'm00d', None]
870 | ||
871 | ||
def test_extract_regex():
    """extract_regex yields a struct with one field per named capture group."""
    extracted = pc.extract_regex(pa.array(['a1', 'zb2z']),
                                 pattern=r'(?P<letter>[ab])(?P<digit>\d)')
    assert extracted.tolist() == [
        {'letter': 'a', 'digit': '1'},
        {'letter': 'b', 'digit': '2'},
    ]
877 | ||
878 | ||
def test_binary_join():
    """binary_join joins list elements with a scalar or per-row separator."""
    # Scalar separator; None list -> None, empty list -> empty string
    lists = pa.array([['foo', 'bar'], None, []])
    assert pc.binary_join(lists, '-').equals(pa.array(['foo-bar', None, '']))

    # Per-row separators over binary-typed lists
    separators = pa.array(['1', '2'], type=pa.binary())
    lists = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary()))
    joined = pc.binary_join(lists, separators)
    assert joined.equals(pa.array(['a1b', 'c2d'], type=pa.binary()))
888 | ||
889 | ||
def test_binary_join_element_wise():
    """Element-wise join under the three null_handling policies."""
    join = pc.binary_join_element_wise
    null = pa.scalar(None, type=pa.string())
    arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']]

    # Default ('emit_null'): any null input nulls the output
    assert join(*arrs).to_pylist() == [None, None, 'b--d']
    assert join('a', 'b', '-').as_py() == 'a-b'
    assert join('a', null, '-').as_py() is None
    assert join('a', 'b', null).as_py() is None

    # 'skip' drops null values, but a null separator still nulls the output
    skip = pc.JoinOptions(null_handling='skip')
    assert join(*arrs, options=skip).to_pylist() == [None, 'a', 'b--d']
    assert join('a', 'b', '-', options=skip).as_py() == 'a-b'
    assert join('a', null, '-', options=skip).as_py() == 'a'
    assert join('a', 'b', null, options=skip).as_py() is None

    # 'replace' substitutes null values, but not a null separator
    replace = pc.JoinOptions(null_handling='replace',
                             null_replacement='spam')
    assert join(*arrs, options=replace).to_pylist() == \
        [None, 'a-spam', 'b--d']
    assert join('a', 'b', '-', options=replace).as_py() == 'a-b'
    assert join('a', null, '-', options=replace).as_py() == 'a-spam'
    assert join('a', 'b', null, options=replace).as_py() is None
918 | ||
919 | ||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_take(ty, values):
    """Array.take gathers by index, propagates null indices, and
    bounds-checks out-of-range indices."""
    arr = pa.array(values, type=ty)
    for index_type in (pa.int8(), pa.int64()):
        taken = arr.take(pa.array([0, 4, 2, None], type=index_type))
        taken.validate()
        expected = pa.array([values[0], values[4], values[2], None], type=ty)
        assert taken.equals(expected)

        # Taking nothing yields an empty array of the same type
        taken = arr.take(pa.array([], type=index_type))
        taken.validate()
        assert taken.equals(pa.array([], type=ty))

    # Out-of-bounds indices (too large or negative) raise
    for bad_indices in ([2, 5], [2, -1]):
        with pytest.raises(IndexError):
            arr.take(pa.array(bad_indices))
944 | ||
945 | ||
def test_take_indices_types():
    """All integer index types are accepted; floating point is rejected."""
    arr = pa.array(range(5))

    for index_type in ('uint8', 'int8', 'uint16', 'int16',
                       'uint32', 'int32', 'uint64', 'int64'):
        taken = arr.take(pa.array([0, 4, 2, None], type=index_type))
        taken.validate()
        assert taken.equals(pa.array([0, 4, 2, None]))

    for index_type in (pa.float32(), pa.float64()):
        with pytest.raises(NotImplementedError):
            arr.take(pa.array([0, 4, 2], type=index_type))
961 | ||
962 | ||
def test_take_on_chunked_array():
    # ARROW-9504: take must work across chunk boundaries
    chunked = pa.chunked_array([
        ["a", "b", "c", "d", "e"],
        ["f", "g", "h", "i", "j"],
    ])

    # numpy indices produce a single output chunk
    result = chunked.take(np.array([0, 5, 1, 6, 9, 2]))
    assert result.equals(pa.chunked_array([["a", "f", "b", "g", "j", "c"]]))

    # chunked indices preserve the chunking of the *indices*
    result = chunked.take(pa.chunked_array([[1], [9, 2]]))
    assert result.equals(pa.chunked_array([["b"], ["j", "c"]]))
999 | ||
1000 | ||
@pytest.mark.parametrize('ordered', [False, True])
def test_take_dictionary(ordered):
    """take on a DictionaryArray keeps the dictionary and its orderedness."""
    dict_arr = pa.DictionaryArray.from_arrays(
        [0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], ordered=ordered)
    taken = dict_arr.take(pa.array([0, 1, 3]))
    taken.validate()
    assert taken.to_pylist() == ['a', 'b', 'a']
    assert taken.dictionary.to_pylist() == ['a', 'b', 'c']
    assert taken.type.ordered is ordered
1010 | ||
1011 | ||
def test_take_null_type():
    # ARROW-10027: take works on null-typed data in every container kind
    arr = pa.array([None] * 10)
    chunked = pa.chunked_array([[None] * 5] * 2)
    batch = pa.record_batch([arr], names=['a'])
    table = pa.table({'a': arr})

    indices = pa.array([1, 3, 7, None])
    assert len(arr.take(indices)) == 4
    assert len(chunked.take(indices)) == 4
    assert len(batch.take(indices).column(0)) == 4
    assert len(table.take(indices).column(0)) == 4
1024 | ||
1025 | ||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_drop_null(ty, values):
    """drop_null is equivalent to taking only the valid positions."""
    arr = pa.array(values, type=ty)
    result = arr.drop_null()
    result.validate(full=True)
    valid_positions = [pos for pos, item in enumerate(arr) if item.is_valid]
    assert result.equals(arr.take(pa.array(valid_positions)))
1034 | ||
1035 | ||
def test_drop_null_chunked_array():
    """drop_null operates per chunk, preserving the chunk layout."""
    chunked = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
    expected = pa.chunked_array([["a"], ["c", "d"], [], []])
    assert chunked.drop_null().equals(expected)
1042 | ||
1043 | ||
def test_drop_null_record_batch():
    """RecordBatch.drop_null keeps only rows valid in *every* column."""
    batch = pa.record_batch(
        [pa.array(["a", None, "c", "d", None])], names=["a'"])
    expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
    assert batch.drop_null().equals(expected)

    # With two columns, a row survives only if both values are non-null
    batch = pa.record_batch(
        [pa.array(["a", None, "c", "d", None]),
         pa.array([None, None, "c", None, "e"])], names=["a'", "b'"])
    expected = pa.record_batch(
        [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"])
    assert batch.drop_null().equals(expected)
1059 | ||
1060 | ||
def test_drop_null_table():
    """Table.drop_null keeps rows valid in every (possibly chunked) column."""
    table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])
    assert table.drop_null().equals(
        pa.table([pa.array(["a", "c", "d"])], names=["a"]))

    # (columns, expected per-column survivors) pairs with chunked columns
    cases = [
        ([pa.chunked_array([["a", None], ["c", "d", None]]),
          pa.chunked_array([["a", None], [None, "d", None]]),
          pa.chunked_array([["a"], ["b"], [None], ["d", None]])],
         [pa.array(["a", "d"]),
          pa.array(["a", "d"]),
          pa.array(["a", "d"])]),
        ([pa.chunked_array([["a", "b"], ["c", "d", "e"]]),
          pa.chunked_array([["A"], ["B"], [None], ["D", None]]),
          pa.chunked_array([["a`", None], ["c`", "d`", None]])],
         [pa.array(["a", "d"]),
          pa.array(["A", "D"]),
          pa.array(["a`", "d`"])]),
    ]
    for columns, expected_columns in cases:
        table = pa.table(columns, names=["a", "b", "c"])
        expected = pa.table(expected_columns, names=["a", "b", "c"])
        assert table.drop_null().equals(expected)
1088 | ||
1089 | ||
def test_drop_null_null_type():
    """drop_null on all-null (null-typed) data leaves nothing behind."""
    nulls = pa.array([None] * 10)
    chunked = pa.chunked_array([[None] * 5] * 2)
    batch = pa.record_batch([nulls], names=['a'])
    table = pa.table({'a': nulls})

    assert len(nulls.drop_null()) == 0
    assert len(chunked.drop_null()) == 0
    assert len(batch.drop_null().column(0)) == 0
    assert len(table.drop_null().column(0)) == 0
1100 | ||
1101 | ||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_filter(ty, values):
    """filter selects by boolean mask; mask nulls can be dropped or
    emitted, and masks must be boolean and the same length."""
    arr = pa.array(values, type=ty)
    mask = pa.array([True, False, False, True, None])

    dropped = arr.filter(mask, null_selection_behavior='drop')
    dropped.validate()
    assert dropped.equals(pa.array([values[0], values[3]], type=ty))

    emitted = arr.filter(mask, null_selection_behavior='emit_null')
    emitted.validate()
    assert emitted.equals(pa.array([values[0], values[3], None], type=ty))

    # Masks must be boolean-typed...
    with pytest.raises(NotImplementedError):
        arr.filter(pa.array([0, 1, 0, 1, 0]))

    # ...and match the data length
    with pytest.raises(ValueError, match="must all be the same length"):
        arr.filter(pa.array([True, False, True]))
1123 | ||
1124 | ||
def test_filter_chunked_array():
    """ChunkedArray.filter accepts plain-list, array and chunked masks."""
    chunked = pa.chunked_array([["a", None], ["c", "d", "e"]])
    after_drop = pa.chunked_array([["a"], ["e"]])
    after_emit = pa.chunked_array([["a"], [None, "e"]])

    masks = (
        pa.array([True, False, None, False, True]),              # array
        pa.chunked_array([[True, False, None], [False, True]]),  # chunked
        [True, False, None, False, True],                        # python
    )
    for mask in masks:
        assert chunked.filter(mask).equals(after_drop)
        emitted = chunked.filter(mask, null_selection_behavior="emit_null")
        assert emitted.equals(after_emit)
1142 | ||
1143 | ||
def test_filter_record_batch():
    """RecordBatch.filter honors null_selection_behavior."""
    batch = pa.record_batch(
        [pa.array(["a", None, "c", "d", "e"])], names=["a'"])
    mask = pa.array([True, False, None, False, True])

    assert batch.filter(mask).equals(
        pa.record_batch([pa.array(["a", "e"])], names=["a'"]))
    emitted = batch.filter(mask, null_selection_behavior="emit_null")
    assert emitted.equals(
        pa.record_batch([pa.array(["a", None, "e"])], names=["a'"]))
1157 | ||
1158 | ||
def test_filter_table():
    """Table.filter accepts plain-list, array and chunked masks."""
    table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
    after_drop = pa.table([pa.array(["a", "e"])], names=["a"])
    after_emit = pa.table([pa.array(["a", None, "e"])], names=["a"])

    masks = (
        pa.array([True, False, None, False, True]),              # array
        pa.chunked_array([[True, False], [None, False, True]]),  # chunked
        [True, False, None, False, True],                        # python
    )
    for mask in masks:
        assert table.filter(mask).equals(after_drop)
        emitted = table.filter(mask, null_selection_behavior="emit_null")
        assert emitted.equals(after_emit)
1176 | ||
1177 | ||
def test_filter_errors():
    """Bad masks fail uniformly across chunked array, batch and table."""
    containers = [
        pa.chunked_array([["a", None], ["c", "d", "e"]]),
        pa.record_batch([pa.array(["a", None, "c", "d", "e"])],
                        names=["a'"]),
        pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"]),
    ]
    for container in containers:
        # Non-boolean mask dtype is rejected
        with pytest.raises(NotImplementedError):
            container.filter(pa.array([0, 1, 0, 1, 0]))

        # Length mismatch is rejected
        with pytest.raises(pa.ArrowInvalid,
                           match="must all be the same length"):
            container.filter(pa.array([True, False, True]))
1195 | ||
1196 | ||
1197 | def test_filter_null_type(): | |
1198 | # ARROW-10027 | |
1199 | arr = pa.array([None] * 10) | |
1200 | chunked_arr = pa.chunked_array([[None] * 5] * 2) | |
1201 | batch = pa.record_batch([arr], names=['a']) | |
1202 | table = pa.table({'a': arr}) | |
1203 | ||
1204 | mask = pa.array([True, False] * 5) | |
1205 | assert len(arr.filter(mask)) == 5 | |
1206 | assert len(chunked_arr.filter(mask)) == 5 | |
1207 | assert len(batch.filter(mask).column(0)) == 5 | |
1208 | assert len(table.filter(mask).column(0)) == 5 | |
1209 | ||
1210 | ||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_array(typ):
    """Element-wise comparisons between two arrays; nulls propagate."""
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    lhs = con([1, 2, 3, 4, None])
    rhs = con([1, 1, 4, None, 4])

    expectations = [
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    ]
    for compare, expected in expectations:
        assert compare(lhs, rhs).equals(con(expected))
1240 | ||
1241 | ||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_string_scalar(typ):
    """Comparisons between a string array and a string scalar."""
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    expectations = [
        (pc.equal, [False, True, False, None]),
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for compare, expected in expectations:
        assert compare(arr, scalar).equals(con(expected))

    if typ == "array":
        # Comparing against a null scalar yields all-null
        nascalar = pa.scalar(None, type="string")
        isnull = pc.is_null(pc.equal(arr, nascalar))
        assert isnull.equals(con([True, True, True, True]))
1277 | ||
1278 | ||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_scalar(typ):
    """Comparisons between an integer array and an integer scalar."""
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    scalar = pa.scalar(2)

    expectations = [
        (pc.equal, [False, True, False, None]),
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for compare, expected in expectations:
        assert compare(arr, scalar).equals(con(expected))

    if typ == "array":
        # Comparing against a null scalar yields all-null
        nascalar = pa.scalar(None, type="int64")
        assert pc.equal(arr, nascalar).to_pylist() == [None] * 4
1313 | ||
1314 | ||
def test_compare_chunked_array_mixed():
    """pc.equal aligns differing chunk layouts between its inputs."""
    plain = pa.array([1, 2, 3, 4, None])
    chunked_a = pa.chunked_array([[1, 2, 3], [4, None]])
    chunked_b = pa.chunked_array([[1, 2], [3, 4, None]])
    expected = pa.chunked_array([[True, True, True, True, None]])

    pairs = [
        (plain, chunked_a),
        (chunked_a, plain),
        (chunked_a, chunked_b),
    ]
    for lhs, rhs in pairs:
        assert pc.equal(lhs, rhs).equals(expected)
1329 | ||
1330 | ||
def test_arithmetic_add():
    """pc.add performs element-wise addition."""
    augend = pa.array([1, 2, 3, 4, 5])
    addend = pa.array([0, -1, 1, 2, 3])
    assert pc.add(augend, addend).equals(pa.array([1, 1, 4, 6, 8]))
1337 | ||
1338 | ||
def test_arithmetic_subtract():
    """pc.subtract performs element-wise subtraction."""
    minuend = pa.array([1, 2, 3, 4, 5])
    subtrahend = pa.array([0, -1, 1, 2, 3])
    assert pc.subtract(minuend, subtrahend).equals(pa.array([1, 3, 2, 2, 2]))
1345 | ||
1346 | ||
def test_arithmetic_multiply():
    """pc.multiply performs element-wise multiplication."""
    multiplicand = pa.array([1, 2, 3, 4, 5])
    multiplier = pa.array([0, -1, 1, 2, 3])
    assert pc.multiply(multiplicand, multiplier).equals(
        pa.array([0, -2, 3, 8, 15]))
1353 | ||
1354 | ||
@pytest.mark.parametrize("ty", ["round", "round_to_multiple"])
def test_round_to_integer(ty):
    """round(ndigits=0) and round_to_multiple(multiple=1) agree for
    every rounding mode."""
    if ty == "round":
        round_fn = pc.round
        make_options = partial(pc.RoundOptions, ndigits=0)
    elif ty == "round_to_multiple":
        round_fn = pc.round_to_multiple
        make_options = partial(pc.RoundToMultipleOptions, multiple=1)

    values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None]
    expected_by_mode = {
        "down": [3, 3, 3, 4, -4, -4, -4, None],
        "up": [4, 4, 4, 5, -3, -3, -3, None],
        "towards_zero": [3, 3, 3, 4, -3, -3, -3, None],
        "towards_infinity": [4, 4, 4, 5, -4, -4, -4, None],
        "half_down": [3, 3, 4, 4, -3, -4, -4, None],
        "half_up": [3, 4, 4, 5, -3, -3, -4, None],
        "half_towards_zero": [3, 3, 4, 4, -3, -3, -4, None],
        "half_towards_infinity": [3, 4, 4, 5, -3, -4, -4, None],
        "half_to_even": [3, 4, 4, 4, -3, -4, -4, None],
        "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None],
    }
    for mode, expected in expected_by_mode.items():
        result = round_fn(values, options=make_options(round_mode=mode))
        np.testing.assert_array_equal(result, pa.array(expected))
1381 | ||
1382 | ||
def test_round():
    """pc.round at several ndigits with half_towards_infinity ties."""
    values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
    expected_by_ndigits = {
        -2: [300, 0, 0, 0, -0, -0, -0, None],
        -1: [320, 0, 0, 0, -0, -40, -0, None],
        0: [320, 4, 3, 5, -3, -35, -3, None],
        1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
        2: [320, 3.5, 3.08, 4.5, -3.21, -35.12, -3.05, None],
    }
    for ndigits, expected in expected_by_ndigits.items():
        opts = pc.RoundOptions(ndigits, "half_towards_infinity")
        rounded = pc.round(values, options=opts)
        np.testing.assert_allclose(rounded, pa.array(expected),
                                   equal_nan=True)
1396 | ||
1397 | ||
def test_round_to_multiple():
    """pc.round_to_multiple with integral and fractional multiples;
    non-positive multiples are rejected."""
    values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
    expected_by_multiple = {
        2: [320, 4, 4, 4, -4, -36, -4, None],
        0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None],
        0.1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
        10: [320, 0, 0, 0, -0, -40, -0, None],
        100: [300, 0, 0, 0, -0, -0, -0, None],
    }
    for multiple, expected in expected_by_multiple.items():
        opts = pc.RoundToMultipleOptions(multiple, "half_towards_infinity")
        rounded = pc.round_to_multiple(values, options=opts)
        np.testing.assert_allclose(rounded, pa.array(expected),
                                   equal_nan=True)

    with pytest.raises(pa.ArrowInvalid, match="multiple must be positive"):
        pc.round_to_multiple(values, multiple=-2)
1414 | ||
1415 | ||
def test_is_null():
    """is_null/is_valid on arrays and chunked arrays, plus NaN handling."""
    arr = pa.array([1, 2, 3, None])
    nulls = arr.is_null()
    assert nulls.equals(pa.array([False, False, False, True]))
    assert nulls.equals(pc.is_null(arr))
    valids = arr.is_valid()
    assert valids.equals(pa.array([True, True, True, False]))
    assert valids.equals(pc.is_valid(arr))

    chunked = pa.chunked_array([[1, 2], [3, None]])
    assert chunked.is_null().equals(
        pa.chunked_array([[False, False], [False, True]]))
    assert chunked.is_valid().equals(
        pa.chunked_array([[True, True], [True, False]]))

    # NaN is considered valid by default...
    with_nan = pa.array([1, 2, 3, None, np.nan])
    assert with_nan.is_null().equals(
        pa.array([False, False, False, True, False]))
    # ...but nan_is_null=True treats it as null too
    assert with_nan.is_null(nan_is_null=True).equals(
        pa.array([False, False, False, True, True]))
1443 | ||
1444 | ||
def test_fill_null():
    """fill_null edge cases: array-valued fill, null type, strings,
    binary, and filling with None."""
    ints = pa.array([1, 2, None, 4], type=pa.int8())
    # A fill value must broadcast; a shorter array is rejected
    with pytest.raises(pa.ArrowInvalid,
                       match="Array arguments must all be the same length"):
        ints.fill_null(pa.array([5], type=pa.int8()))

    # Null-typed data filled with a null scalar stays all-null
    nulls = pa.array([None, None, None, None], type=pa.null())
    filled = nulls.fill_null(pa.scalar(None, type=pa.null()))
    assert filled.equals(pa.array([None, None, None, None]))

    # Plain Python strings are accepted as fill values
    strings = pa.array(['a', 'bb', None])
    assert strings.fill_null('ccc').equals(pa.array(['a', 'bb', 'ccc']))

    # ...including for binary-typed data
    blobs = pa.array([b'a', b'bb', None], type=pa.large_binary())
    assert blobs.fill_null('ccc').equals(
        pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary()))

    # Filling with None is a no-op
    strings = pa.array(['a', 'bb', None])
    assert strings.fill_null(None).equals(pa.array(['a', 'bb', None]))
1472 | ||
1473 | ||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_fill_null_array(arrow_type):
    """fill_null on numeric arrays, including implicit conversions."""
    arr = pa.array([1, 2, None, 4], type=arrow_type)
    expected = pa.array([1, 2, 5, 4], type=arrow_type)

    assert arr.fill_null(pa.scalar(5, type=arrow_type)).equals(expected)

    # Implicit conversion from a Python int
    assert arr.fill_null(5).equals(expected)

    # ARROW-9451: Unsigned integers allow this for some reason
    if not pa.types.is_unsigned_integer(arr.type):
        with pytest.raises((ValueError, TypeError)):
            arr.fill_null('5')

    # Implicit conversion from a differently-typed numeric scalar
    assert arr.fill_null(pa.scalar(5, type='int8')).equals(expected)
1493 | ||
1494 | ||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_fill_null_chunked_array(arrow_type):
    """fill_null preserves chunk layout, including empty chunks."""
    fill = pa.scalar(5, type=arrow_type)

    single = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
    assert single.fill_null(fill).equals(
        pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)]))

    chunked = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([None, 4], type=arrow_type),
    ])
    expected = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([5, 4], type=arrow_type),
    ])
    assert chunked.fill_null(fill).equals(expected)

    # Implicit conversions: Python int and a differently-typed scalar
    assert chunked.fill_null(5).equals(expected)
    assert chunked.fill_null(pa.scalar(5, type='int8')).equals(expected)
1522 | ||
1523 | ||
def test_logical():
    """Boolean kernels: regular vs Kleene logic differ only on nulls."""
    lhs = pa.array([True, False, False, None])
    rhs = pa.array([True, True, False, True])

    assert pc.and_(lhs, rhs) == pa.array([True, False, False, None])
    assert pc.and_kleene(lhs, rhs) == pa.array([True, False, False, None])

    assert pc.or_(lhs, rhs) == pa.array([True, True, False, None])
    # Kleene logic: null OR True is True
    assert pc.or_kleene(lhs, rhs) == pa.array([True, True, False, True])

    assert pc.xor(lhs, rhs) == pa.array([False, True, False, None])
    assert pc.invert(lhs) == pa.array([False, True, True, None])
1537 | ||
1538 | ||
def test_cast():
    """Safe vs unsafe integer casts, plus datetime and nested-list casts."""
    big = pa.array([2 ** 63 - 1], type='int64')
    # Overflow is rejected by default...
    with pytest.raises(pa.ArrowInvalid):
        pc.cast(big, 'int32')
    # ...but an unsafe cast truncates/wraps
    assert pc.cast(big, 'int32', safe=False) == pa.array([-1], type='int32')

    timestamps = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    as_millis = pa.array([1262304000000, 1420070400000],
                         type='timestamp[ms]')
    assert pc.cast(timestamps, 'timestamp[ms]') == as_millis

    nested = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8()))
    as_strings = pa.array([["1", "2"], ["3", "4", "5"]],
                          type=pa.list_(pa.utf8()))
    assert pc.cast(nested, as_strings.type) == as_strings
1555 | ||
1556 | ||
def test_strptime():
    """Parsing strings with strptime keeps nulls and honours the format."""
    values = pa.array(["5/1/2020", None, "12/13/1900"])
    parsed = pc.strptime(values, format='%m/%d/%Y', unit='s')
    expected = pa.array(
        [datetime(2020, 5, 1), None, datetime(1900, 12, 13)],
        type=pa.timestamp('s'))
    assert parsed == expected
1564 | ||
1565 | ||
# TODO: We should test on windows once ARROW-13168 is resolved.
@pytest.mark.pandas
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Timezone database is not available on Windows yet")
def test_strftime():
    """Compare pc.strftime output against pandas' strftime rendering
    across timezones, time units and a wide range of format codes."""
    from pyarrow.vendored.version import Version

    def _fix_timestamp(s):
        # Pandas < 1.0 renders null timestamps as the string "NaT";
        # map those back to real missing values for the comparison.
        if Version(pd.__version__) < Version("1.0.0"):
            return s.to_series().replace("NaT", pd.NaT)
        else:
            return s

    times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
    timezones = ["CET", "UTC", "Europe/Ljubljana"]

    # Format codes expected to render identically in pandas and Arrow.
    formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
               "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
               "%X", "%%", "%G", "%V", "%u"]

    for timezone in timezones:
        ts = pd.to_datetime(times).tz_localize(timezone)
        for unit in ["s", "ms", "us", "ns"]:
            tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
            for fmt in formats:
                options = pc.StrftimeOptions(fmt)
                result = pc.strftime(tsa, options=options)
                expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
                assert result.equals(expected)

        fmt = "%Y-%m-%dT%H:%M:%S"

        # Default format
        tsa = pa.array(ts, type=pa.timestamp("s", timezone))
        result = pc.strftime(tsa, options=pc.StrftimeOptions())
        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
        assert result.equals(expected)

        # Default format plus timezone
        tsa = pa.array(ts, type=pa.timestamp("s", timezone))
        result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
        expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
        assert result.equals(expected)

        # Pandas %S is equivalent to %S in arrow for unit="s"
        tsa = pa.array(ts, type=pa.timestamp("s", timezone))
        options = pc.StrftimeOptions("%S")
        result = pc.strftime(tsa, options=options)
        expected = pa.array(_fix_timestamp(ts.strftime("%S")))
        assert result.equals(expected)

        # Pandas %S.%f is equivalent to %S in arrow for unit="us"
        tsa = pa.array(ts, type=pa.timestamp("us", timezone))
        options = pc.StrftimeOptions("%S")
        result = pc.strftime(tsa, options=options)
        expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
        assert result.equals(expected)

        # Test setting locale
        tsa = pa.array(ts, type=pa.timestamp("s", timezone))
        options = pc.StrftimeOptions(fmt, locale="C")
        result = pc.strftime(tsa, options=options)
        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
        assert result.equals(expected)

    # Test timestamps without timezone
    fmt = "%Y-%m-%dT%H:%M:%S"
    ts = pd.to_datetime(times)
    tsa = pa.array(ts, type=pa.timestamp("s"))
    result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
    expected = pa.array(_fix_timestamp(ts.strftime(fmt)))

    assert result.equals(expected)
    # %Z / %z need a timezone to render: naive input must raise.
    with pytest.raises(pa.ArrowInvalid,
                       match="Timezone not present, cannot convert to string"):
        pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
    with pytest.raises(pa.ArrowInvalid,
                       match="Timezone not present, cannot convert to string"):
        pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z"))
1645 | ||
1646 | ||
def _check_datetime_components(timestamps, timezone=None):
    """Compare Arrow's datetime-component kernels against the values
    pandas extracts for the same timestamps.

    `timestamps` is a list of timestamp strings; when `timezone` is given
    the values are localized to UTC and converted to that zone first.
    """
    from pyarrow.vendored.version import Version

    ts = pd.to_datetime(timestamps).tz_localize(
        "UTC").tz_convert(timezone).to_series()
    tsa = pa.array(ts, pa.timestamp("ns", tz=timezone))

    # Sub-second part as fractional seconds, rounded to ns precision.
    subseconds = ((ts.dt.microsecond * 10 ** 3 +
                   ts.dt.nanosecond) * 10 ** -9).round(9)
    iso_calendar_fields = [
        pa.field('iso_year', pa.int64()),
        pa.field('iso_week', pa.int64()),
        pa.field('iso_day_of_week', pa.int64())
    ]

    if Version(pd.__version__) < Version("1.1.0"):
        # https://github.com/pandas-dev/pandas/issues/33206
        iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64")
        iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64")
        iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64")
    else:
        # Casting is required because pandas isocalendar returns int32
        # while arrow isocalendar returns int64.
        iso_year = ts.dt.isocalendar()["year"].astype("int64")
        iso_week = ts.dt.isocalendar()["week"].astype("int64")
        iso_day = ts.dt.isocalendar()["day"].astype("int64")

    iso_calendar = pa.StructArray.from_arrays(
        [iso_year, iso_week, iso_day],
        fields=iso_calendar_fields)

    # Each Arrow kernel must agree with the corresponding pandas accessor.
    assert pc.year(tsa).equals(pa.array(ts.dt.year))
    assert pc.month(tsa).equals(pa.array(ts.dt.month))
    assert pc.day(tsa).equals(pa.array(ts.dt.day))
    assert pc.day_of_week(tsa).equals(pa.array(ts.dt.dayofweek))
    assert pc.day_of_year(tsa).equals(pa.array(ts.dt.dayofyear))
    assert pc.iso_year(tsa).equals(pa.array(iso_year))
    assert pc.iso_week(tsa).equals(pa.array(iso_week))
    assert pc.iso_calendar(tsa).equals(iso_calendar)
    assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter))
    assert pc.hour(tsa).equals(pa.array(ts.dt.hour))
    assert pc.minute(tsa).equals(pa.array(ts.dt.minute))
    assert pc.second(tsa).equals(pa.array(ts.dt.second.values))
    assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10 ** 3))
    assert pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10 ** 3))
    assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond))
    assert pc.subsecond(tsa).equals(pa.array(subseconds))

    # One-based day numbering with the week starting on Tuesday-index 1.
    day_of_week_options = pc.DayOfWeekOptions(
        count_from_zero=False, week_start=1)
    assert pc.day_of_week(tsa, options=day_of_week_options).equals(
        pa.array(ts.dt.dayofweek + 1))

    # These options reproduce ISO week numbering.
    week_options = pc.WeekOptions(
        week_starts_monday=True, count_from_zero=False,
        first_week_is_fully_in_year=False)
    assert pc.week(tsa, options=week_options).equals(pa.array(iso_week))
1704 | ||
1705 | ||
@pytest.mark.pandas
def test_extract_datetime_components():
    """Run _check_datetime_components over edge-case timestamps, both
    timezone-naive and localized to several timezones."""
    from pyarrow.vendored.version import Version

    # Edge cases: epoch second, leap day, ISO year/week boundaries,
    # plain dates, and assorted sub-second precisions.
    timestamps = ["1970-01-01T00:00:59.123456789",
                  "2000-02-29T23:23:23.999999999",
                  "2033-05-18T03:33:20.000000000",
                  "2020-01-01T01:05:05.001",
                  "2019-12-31T02:10:10.002",
                  "2019-12-30T03:15:15.003",
                  "2009-12-31T04:20:20.004132",
                  "2010-01-01T05:25:25.005321",
                  "2010-01-03T06:30:30.006163",
                  "2010-01-04T07:35:35",
                  "2006-01-01T08:40:40",
                  "2005-12-31T09:45:45",
                  "2008-12-28",
                  "2008-12-29",
                  "2012-01-01 01:02:03"]
    # Mix of UTC, DST-observing zones, fixed offsets and a half-hour zone.
    timezones = ["UTC", "US/Central", "Asia/Kolkata",
                 "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"]

    # Test timezone naive timestamp array
    _check_datetime_components(timestamps)

    # Test timezone aware timestamp array
    if sys.platform == 'win32':
        # TODO: We should test on windows once ARROW-13168 is resolved.
        pytest.skip('Timezone database is not available on Windows yet')
    elif Version(pd.__version__) < Version('1.0.0'):
        pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
    else:
        for timezone in timezones:
            _check_datetime_components(timestamps, timezone)
1740 | ||
1741 | ||
# TODO: We should test on windows once ARROW-13168 is resolved.
@pytest.mark.pandas
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Timezone database is not available on Windows yet")
def test_assume_timezone():
    """assume_timezone attaches a timezone to naive timestamps, matching
    pandas' tz_localize, including handling of DST-ambiguous and
    DST-nonexistent local times.

    Bug fix: four `.equals(...)` comparisons previously discarded their
    boolean result, so those checks never took effect; they are now
    asserted.
    """
    from pyarrow.vendored.version import Version

    ts_type = pa.timestamp("ns")
    timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
                                 "2000-02-29T23:23:23.999999999",
                                 "2033-05-18T03:33:20.000000000",
                                 "2020-01-01T01:05:05.001",
                                 "2019-12-31T02:10:10.002",
                                 "2019-12-30T03:15:15.003",
                                 "2009-12-31T04:20:20.004132",
                                 "2010-01-01T05:25:25.005321",
                                 "2010-01-03T06:30:30.006163",
                                 "2010-01-04T07:35:35",
                                 "2006-01-01T08:40:40",
                                 "2005-12-31T09:45:45",
                                 "2008-12-28",
                                 "2008-12-29",
                                 "2012-01-01 01:02:03"])
    # Local times skipped by the Brussels spring-forward transition.
    nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
                                  "2015-03-29 03:30:00"])
    # Local times occurring twice at the Brussels fall-back transition.
    ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
                                "2018-10-28 02:36:00",
                                "2018-10-28 03:46:00"])
    ambiguous_array = pa.array(ambiguous, type=ts_type)
    nonexistent_array = pa.array(nonexistent, type=ts_type)

    for timezone in ["UTC", "US/Central", "Asia/Kolkata"]:
        options = pc.AssumeTimezoneOptions(timezone)
        ta = pa.array(timestamps, type=ts_type)
        expected = timestamps.tz_localize(timezone)
        result = pc.assume_timezone(ta, options=options)
        assert result.equals(pa.array(expected))

        # Input that already carries a timezone must be rejected.
        ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone))
        with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"):
            pc.assume_timezone(ta_zoned, options=options)

    invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss")
    with pytest.raises(ValueError, match="not found in timezone database"):
        pc.assume_timezone(ta, options=invalid_options)

    timezone = "Europe/Brussels"

    # nonexistent parameter was introduced in Pandas 0.24.0
    if Version(pd.__version__) >= Version("0.24.0"):
        options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
        options_nonexistent_earliest = pc.AssumeTimezoneOptions(
            timezone, ambiguous="raise", nonexistent="earliest")
        options_nonexistent_latest = pc.AssumeTimezoneOptions(
            timezone, ambiguous="raise", nonexistent="latest")

        with pytest.raises(ValueError,
                           match="Timestamp doesn't exist in "
                                 f"timezone '{timezone}'"):
            pc.assume_timezone(nonexistent_array,
                               options=options_nonexistent_raise)

        # "latest" shifts a nonexistent time forward past the gap.
        expected = pa.array(nonexistent.tz_localize(
            timezone, nonexistent="shift_forward"))
        result = pc.assume_timezone(
            nonexistent_array, options=options_nonexistent_latest)
        assert expected.equals(result)

        # "earliest" shifts it backward before the gap.
        expected = pa.array(nonexistent.tz_localize(
            timezone, nonexistent="shift_backward"))
        result = pc.assume_timezone(
            nonexistent_array, options=options_nonexistent_earliest)
        assert expected.equals(result)

    options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
    options_ambiguous_latest = pc.AssumeTimezoneOptions(
        timezone, ambiguous="latest", nonexistent="raise")
    options_ambiguous_earliest = pc.AssumeTimezoneOptions(
        timezone, ambiguous="earliest", nonexistent="raise")

    with pytest.raises(ValueError,
                       match="Timestamp is ambiguous in "
                             f"timezone '{timezone}'"):
        pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise)

    # ambiguous=True in pandas picks the earlier (DST) interpretation.
    expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True])
    result = pc.assume_timezone(
        ambiguous_array, options=options_ambiguous_earliest)
    assert result.equals(pa.array(expected))

    # ambiguous=False picks the later (standard-time) interpretation.
    expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False])
    result = pc.assume_timezone(
        ambiguous_array, options=options_ambiguous_latest)
    assert result.equals(pa.array(expected))
1836 | ||
1837 | ||
def test_count():
    """count() honours the mode option: valid, null, or all elements."""
    values = pa.array([1, 2, 3, None, None])
    # The default mode counts only non-null values.
    assert pc.count(values).as_py() == 3
    for mode, expected in (('only_valid', 3), ('only_null', 2), ('all', 5)):
        assert pc.count(values, mode=mode).as_py() == expected
1844 | ||
1845 | ||
def test_index():
    """index() returns the first match position, honouring start/end."""
    values = pa.array([0, 1, None, 3, 4], type=pa.int64())
    assert pc.index(values, pa.scalar(0)).as_py() == 0
    # A scalar of a different integer type is still comparable.
    assert pc.index(values, pa.scalar(2, type=pa.int8())).as_py() == -1
    assert pc.index(values, 4).as_py() == 4
    assert values.index(3, start=2).as_py() == 3
    # Nulls are never matched.
    assert values.index(None).as_py() == -1

    chunked = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
    assert chunked.index(1).as_py() == 0
    # `start` may point into a later chunk.
    assert chunked.index(1, start=2).as_py() == 2
    # A [start, end) window containing no match yields -1.
    assert chunked.index(1, start=1, end=2).as_py() == -1
1858 | ||
1859 | ||
def check_partition_nth(data, indices, pivot, null_placement):
    """Verify that `indices` is a valid nth-element partition of `data`.

    `indices` (an Arrow array) must be a permutation of the positions of
    `data` such that every element placed before position `pivot` compares
    <= the pivot element and every element after compares >=, with nulls
    grouped on the side given by `null_placement`.
    """
    indices = indices.to_pylist()
    # Must be a permutation of range(len(data)).
    assert len(indices) == len(data)
    assert sorted(indices) == list(range(len(data)))
    until_pivot = [data[indices[i]] for i in range(pivot)]
    after_pivot = [data[indices[i]] for i in range(pivot, len(data))]
    p = data[indices[pivot]]
    if p is None:
        # Null pivot: everything on the null side up to/after the pivot
        # must itself be null.
        if null_placement == "at_start":
            assert all(v is None for v in until_pivot)
        else:
            assert all(v is None for v in after_pivot)
    else:
        # Non-null pivot: ordered partition, with nulls permitted only
        # on their designated side.
        if null_placement == "at_start":
            assert all(v is None or v <= p for v in until_pivot)
            assert all(v >= p for v in after_pivot)
        else:
            assert all(v <= p for v in until_pivot)
            assert all(v is None or v >= p for v in after_pivot)
1879 | ||
1880 | ||
def test_partition_nth():
    """Partition shuffled, distinct integers around a fixed pivot index."""
    values = list(range(100, 140))
    random.shuffle(values)
    nth = 10
    result = pc.partition_nth_indices(values, pivot=nth)
    check_partition_nth(values, result, nth, "at_end")
1887 | ||
1888 | ||
def test_partition_nth_null_placement():
    """Partitioning data that is half nulls, for both null placements."""
    values = list(range(10)) + [None] * 10
    random.shuffle(values)

    # Pivots land before, inside and after the null region.
    for placement in ("at_start", "at_end"):
        for nth in (0, 7, 13, 19):
            result = pc.partition_nth_indices(
                values, pivot=nth, null_placement=placement)
            check_partition_nth(values, result, nth, placement)
1898 | ||
1899 | ||
def test_select_k_array():
    """select_k_unstable / top_k / bottom_k on an array, validated against
    the first k entries of a full sort of the same data."""
    def validate_select_k(select_k_indices, arr, order, stable_sort=False):
        # For an unstable selection, compare the selected *values* rather
        # than the index order, since ties may come back in any order.
        sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)])
        head_k_indices = sorted_indices.slice(0, len(select_k_indices))
        if stable_sort:
            assert select_k_indices == head_k_indices
        else:
            expected = pc.take(arr, head_k_indices)
            actual = pc.take(arr, select_k_indices)
            assert actual == expected

    arr = pa.array([1, 2, None, 0])
    # k == 4 exceeds the non-null count; k == 0 selects nothing.
    for k in [0, 2, 4]:
        for order in ["descending", "ascending"]:
            result = pc.select_k_unstable(
                arr, k=k, sort_keys=[("dummy", order)])
            validate_select_k(result, arr, order)

        result = pc.top_k_unstable(arr, k=k)
        validate_select_k(result, arr, "descending")

        result = pc.bottom_k_unstable(arr, k=k)
        validate_select_k(result, arr, "ascending")

    # The same requests expressed via an explicit SelectKOptions.
    result = pc.select_k_unstable(
        arr, options=pc.SelectKOptions(
            k=2, sort_keys=[("dummy", "descending")])
    )
    validate_select_k(result, arr, "descending")

    result = pc.select_k_unstable(
        arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")])
    )
    validate_select_k(result, arr, "ascending")
1934 | ||
1935 | ||
def test_select_k_table():
    """select_k_unstable / top_k / bottom_k on a table with one or two
    sort keys, plus option-validation errors."""
    def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False):
        # As in the array test: unstable selection is validated by the
        # selected rows, not by index order.
        sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys)
        head_k_indices = sorted_indices.slice(0, len(select_k_indices))
        if stable_sort:
            assert select_k_indices == head_k_indices
        else:
            expected = pc.take(tbl, head_k_indices)
            actual = pc.take(tbl, select_k_indices)
            assert actual == expected

    table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]})
    for k in [0, 2, 4]:
        result = pc.select_k_unstable(
            table, k=k, sort_keys=[("a", "ascending")])
        validate_select_k(result, table, sort_keys=[("a", "ascending")])

        result = pc.select_k_unstable(
            table, k=k, sort_keys=[("a", "ascending"), ("b", "ascending")])
        validate_select_k(
            result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])

        # top_k/bottom_k take bare column names and imply the order.
        result = pc.top_k_unstable(table, k=k, sort_keys=["a"])
        validate_select_k(result, table, sort_keys=[("a", "descending")])

        result = pc.bottom_k_unstable(table, k=k, sort_keys=["a", "b"])
        validate_select_k(
            result, table, sort_keys=[("a", "ascending"), ("b", "ascending")])

    # Missing/invalid options must be reported clearly.
    with pytest.raises(ValueError,
                       match="select_k_unstable requires a nonnegative `k`"):
        pc.select_k_unstable(table)

    with pytest.raises(ValueError,
                       match="select_k_unstable requires a "
                             "non-empty `sort_keys`"):
        pc.select_k_unstable(table, k=2, sort_keys=[])

    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")])

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")])
1979 | ||
1980 | ||
def test_array_sort_indices():
    """array_sort_indices: default/explicit order and null placement."""
    values = pa.array([1, 2, None, 0])

    # Ascending is the default; nulls sort to the end.
    assert pc.array_sort_indices(values).to_pylist() == [3, 0, 1, 2]
    assert pc.array_sort_indices(
        values, order="ascending").to_pylist() == [3, 0, 1, 2]
    assert pc.array_sort_indices(
        values, order="descending").to_pylist() == [1, 0, 3, 2]
    assert pc.array_sort_indices(
        values, order="descending",
        null_placement="at_start").to_pylist() == [2, 1, 0, 3]

    # An unknown order name is rejected.
    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.array_sort_indices(values, order="nonscending")
1995 | ||
1996 | ||
def test_sort_indices_array():
    """sort_indices on an array; the sort key name ("dummy") is ignored."""
    values = pa.array([1, 2, None, 0])

    # Default: ascending with nulls last.
    assert pc.sort_indices(values).to_pylist() == [3, 0, 1, 2]
    got = pc.sort_indices(values, sort_keys=[("dummy", "ascending")])
    assert got.to_pylist() == [3, 0, 1, 2]
    got = pc.sort_indices(values, sort_keys=[("dummy", "descending")])
    assert got.to_pylist() == [1, 0, 3, 2]
    got = pc.sort_indices(values, sort_keys=[("dummy", "descending")],
                          null_placement="at_start")
    assert got.to_pylist() == [2, 1, 0, 3]

    # The same requests expressed through a SortOptions instance.
    opts = pc.SortOptions(sort_keys=[("dummy", "descending")])
    assert pc.sort_indices(values, options=opts).to_pylist() == [1, 0, 3, 2]
    opts = pc.SortOptions(sort_keys=[("dummy", "descending")],
                          null_placement="at_start")
    assert pc.sort_indices(values, options=opts).to_pylist() == [2, 1, 0, 3]
2018 | ||
2019 | ||
def test_sort_indices_table():
    """sort_indices on a table with one or two sort keys, plus errors."""
    table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]})

    result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
    assert result.to_pylist() == [3, 0, 1, 2]
    result = pc.sort_indices(table, sort_keys=[("a", "ascending")],
                             null_placement="at_start")
    assert result.to_pylist() == [2, 3, 0, 1]

    # The secondary key "b" breaks the tie between the rows with a == 1.
    result = pc.sort_indices(
        table, sort_keys=[("a", "descending"), ("b", "ascending")]
    )
    assert result.to_pylist() == [1, 0, 3, 2]
    result = pc.sort_indices(
        table, sort_keys=[("a", "descending"), ("b", "ascending")],
        null_placement="at_start"
    )
    assert result.to_pylist() == [2, 1, 0, 3]

    # Tables require explicit sort keys; bad keys/orders are rejected.
    with pytest.raises(ValueError, match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])
2047 | ||
2048 | ||
def test_is_in():
    """is_in membership with/without a null in the value set."""
    values = pa.array([1, 2, None, 1, 2, 3])

    # A null in the value set matches null input unless skip_nulls is set.
    matched = pc.is_in(values, value_set=pa.array([1, 3, None]))
    assert matched.to_pylist() == [True, False, True, True, False, True]

    matched = pc.is_in(values, value_set=pa.array([1, 3, None]),
                       skip_nulls=True)
    assert matched.to_pylist() == [True, False, False, True, False, True]

    # Without a null in the value set, null input never matches.
    matched = pc.is_in(values, value_set=pa.array([1, 3]))
    assert matched.to_pylist() == [True, False, False, True, False, True]

    matched = pc.is_in(values, value_set=pa.array([1, 3]), skip_nulls=True)
    assert matched.to_pylist() == [True, False, False, True, False, True]
2063 | ||
2064 | ||
def test_index_in():
    """index_in maps each value to its position in the value set."""
    values = pa.array([1, 2, None, 1, 2, 3])

    # A null in the value set gives null input its position (2),
    # unless skip_nulls is requested.
    positions = pc.index_in(values, value_set=pa.array([1, 3, None]))
    assert positions.to_pylist() == [0, None, 2, 0, None, 1]

    positions = pc.index_in(values, value_set=pa.array([1, 3, None]),
                            skip_nulls=True)
    assert positions.to_pylist() == [0, None, None, 0, None, 1]

    # Without a null in the value set, null input always maps to null.
    positions = pc.index_in(values, value_set=pa.array([1, 3]))
    assert positions.to_pylist() == [0, None, None, 0, None, 1]

    positions = pc.index_in(values, value_set=pa.array([1, 3]),
                            skip_nulls=True)
    assert positions.to_pylist() == [0, None, None, 0, None, 1]
2080 | ||
2081 | ||
def test_quantile():
    """quantile() across all interpolation modes, plus invalid inputs."""
    values = pa.array([1, 2, 3, 4])

    # The default is the median with linear interpolation.
    assert pc.quantile(values).to_pylist() == [2.5]

    single_cases = [('lower', [2]), ('higher', [3]), ('nearest', [3]),
                    ('midpoint', [2.5]), ('linear', [2.5])]
    for interpolation, expected in single_cases:
        got = pc.quantile(values, interpolation=interpolation)
        assert got.to_pylist() == expected

    values = pa.array([1, 2])
    quantiles = [0.25, 0.5, 0.75]

    assert pc.quantile(values, q=quantiles).to_pylist() == [1.25, 1.5, 1.75]

    multi_cases = [('lower', [1, 1, 1]), ('higher', [2, 2, 2]),
                   ('midpoint', [1.5, 1.5, 1.5]), ('nearest', [1, 1, 2]),
                   ('linear', [1.25, 1.5, 1.75])]
    for interpolation, expected in multi_cases:
        got = pc.quantile(values, q=quantiles, interpolation=interpolation)
        assert got.to_pylist() == expected

    # Out-of-range q and unknown interpolation names are rejected.
    with pytest.raises(ValueError, match="Quantile must be between 0 and 1"):
        pc.quantile(values, q=1.1)
    with pytest.raises(ValueError, match="not a valid quantile interpolation"):
        pc.quantile(values, interpolation='zzz')
2119 | ||
2120 | ||
def test_tdigest():
    """tdigest on plain and chunked arrays, default and explicit q."""
    flat = pa.array([1, 2, 3, 4])
    chunked = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])

    # The default computes only the median.
    for data in (flat, chunked):
        assert pc.tdigest(data).to_pylist() == [2.5]

    # Explicit quantiles: minimum, median and maximum.
    for data in (flat, chunked):
        assert pc.tdigest(data, q=[0, 0.5, 1]).to_pylist() == [1, 2.5, 4]
2137 | ||
2138 | ||
def test_fill_null_segfault():
    # Regression test for ARROW-12672: casting the result of
    # fill_null() on an all-null boolean array used to crash.
    filled = pa.array([None], pa.bool_()).fill_null(False)
    assert filled.cast(pa.int8()) == pa.array([0], pa.int8())
2144 | ||
2145 | ||
def test_min_max_element_wise():
    """Element-wise min/max across multiple arrays, with null handling."""
    arr1 = pa.array([1, 2, 3])
    arr2 = pa.array([3, 1, 2])
    arr3 = pa.array([2, 3, None])

    result = pc.max_element_wise(arr1, arr2)
    assert result == pa.array([3, 2, 3])
    result = pc.min_element_wise(arr1, arr2)
    assert result == pa.array([1, 1, 2])

    # Nulls are skipped by default, so arr3's null does not affect row 2.
    result = pc.max_element_wise(arr1, arr2, arr3)
    assert result == pa.array([3, 3, 3])
    result = pc.min_element_wise(arr1, arr2, arr3)
    assert result == pa.array([1, 1, 2])

    # with specifying the option
    result = pc.max_element_wise(arr1, arr3, skip_nulls=True)
    assert result == pa.array([2, 3, 3])
    result = pc.min_element_wise(arr1, arr3, skip_nulls=True)
    assert result == pa.array([1, 2, 3])
    # Default-constructed options behave the same as skip_nulls=True.
    result = pc.max_element_wise(
        arr1, arr3, options=pc.ElementWiseAggregateOptions())
    assert result == pa.array([2, 3, 3])
    result = pc.min_element_wise(
        arr1, arr3, options=pc.ElementWiseAggregateOptions())
    assert result == pa.array([1, 2, 3])

    # not skipping nulls: a null input nulls the corresponding output row
    result = pc.max_element_wise(arr1, arr3, skip_nulls=False)
    assert result == pa.array([2, 3, None])
    result = pc.min_element_wise(arr1, arr3, skip_nulls=False)
    assert result == pa.array([1, 2, None])
2178 | ||
2179 | ||
def test_make_struct():
    """make_struct from scalars and arrays, with default and explicit
    field names, plus argument-validation errors."""
    # Field names default to the argument positions ("0", "1", ...).
    assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'}

    assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == {
        'i': 1, 's': 'a'}

    # Array arguments produce a StructArray.
    assert pc.make_struct([1, 2, 3],
                          "a b c".split()) == pa.StructArray.from_arrays([
                              [1, 2, 3],
                              "a b c".split()], names='0 1'.split())

    with pytest.raises(ValueError,
                       match="Array arguments must all be the same length"):
        pc.make_struct([1, 2, 3, 4], "a b c".split())

    # field_names must match the number of arguments.
    with pytest.raises(ValueError, match="0 arguments but 2 field names"):
        pc.make_struct(field_names=['one', 'two'])
2197 | ||
2198 | ||
def test_case_when():
    """case_when picks the branch of the first condition that is true."""
    conditions = pc.make_struct([True, False, None], [False, True, None])
    result = pc.case_when(conditions, [1, 2, 3], [11, 12, 13])
    # Row 0 takes branch 1, row 1 branch 2; no condition fires for row 2.
    assert result == pa.array([1, 12, None])
2204 | ||
2205 | ||
def test_list_element():
    """list_element extracts the value at a fixed index from each list."""
    element_type = pa.struct([('a', pa.float64()), ('b', pa.int8())])
    list_type = pa.list_(element_type)
    row1 = [{'a': .4, 'b': 2}, None, {'a': .2, 'b': 4}, None,
            {'a': 5.6, 'b': 6}]
    row2 = [None, {'a': .52, 'b': 3}, {'a': .7, 'b': 4}, None,
            {'a': .6, 'b': 8}]
    lists = pa.array([row1, row2], list_type)

    # A middle index: one row yields a null element.
    got = pa.compute.list_element(lists, 1)
    assert got.equals(pa.array([None, {'a': 0.52, 'b': 3}], element_type))

    # The last index, valid in both rows.
    got = pa.compute.list_element(lists, 4)
    assert got.equals(
        pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type))
2222 | ||
2223 | ||
def test_count_distinct():
    """count_distinct over 100 timestamps that differ only by year."""
    base = datetime.now()
    samples = [base.replace(year=year) for year in range(1992, 2092)]
    values = pa.array(samples, pa.timestamp("ns"))
    # Every sample is unique, so the distinct count equals the length.
    assert pa.compute.count_distinct(values).equals(
        pa.scalar(len(samples), type=pa.int64()))
2231 | ||
2232 | ||
def test_count_distinct_options():
    """count_distinct() honours the same mode option as count()."""
    arr = pa.array([1, 2, 3, None, None])
    # Default counts distinct non-null values.
    assert pc.count_distinct(arr).as_py() == 3
    assert pc.count_distinct(arr, mode='only_valid').as_py() == 3
    # The two nulls count as a single distinct null.
    assert pc.count_distinct(arr, mode='only_null').as_py() == 1
    assert pc.count_distinct(arr, mode='all').as_py() == 4