]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/python/pyarrow/compute.py
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / python / pyarrow / compute.py
CommitLineData
1d09f67e
TL
1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements. See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership. The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License. You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied. See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18from pyarrow._compute import ( # noqa
19 Function,
20 FunctionOptions,
21 FunctionRegistry,
22 HashAggregateFunction,
23 HashAggregateKernel,
24 Kernel,
25 ScalarAggregateFunction,
26 ScalarAggregateKernel,
27 ScalarFunction,
28 ScalarKernel,
29 VectorFunction,
30 VectorKernel,
31 # Option classes
32 ArraySortOptions,
33 AssumeTimezoneOptions,
34 CastOptions,
35 CountOptions,
36 DayOfWeekOptions,
37 DictionaryEncodeOptions,
38 ElementWiseAggregateOptions,
39 ExtractRegexOptions,
40 FilterOptions,
41 IndexOptions,
42 JoinOptions,
43 MakeStructOptions,
44 MatchSubstringOptions,
45 ModeOptions,
46 NullOptions,
47 PadOptions,
48 PartitionNthOptions,
49 QuantileOptions,
50 ReplaceSliceOptions,
51 ReplaceSubstringOptions,
52 RoundOptions,
53 RoundToMultipleOptions,
54 ScalarAggregateOptions,
55 SelectKOptions,
56 SetLookupOptions,
57 SliceOptions,
58 SortOptions,
59 SplitOptions,
60 SplitPatternOptions,
61 StrftimeOptions,
62 StrptimeOptions,
63 TakeOptions,
64 TDigestOptions,
65 TrimOptions,
66 VarianceOptions,
67 WeekOptions,
68 # Functions
69 call_function,
70 function_registry,
71 get_function,
72 list_functions,
73)
74
75import inspect
76from textwrap import dedent
77import warnings
78
79import pyarrow as pa
80
81
def _get_arg_names(func):
    """Return the argument names stored on *func*'s documentation object."""
    doc = func._doc
    return doc.arg_names
84
85
def _decorate_compute_function(wrapper, exposed_name, func, option_class):
    """
    Attach metadata and a generated docstring to a compute function wrapper.

    Parameters
    ----------
    wrapper : callable
        The Python wrapper to decorate. It is modified in place.
    exposed_name : str
        Name assigned to the wrapper's ``__name__`` and ``__qualname__``.
    func : Function
        The compute function being wrapped. Its ``name``, ``arity``,
        ``kind`` and ``_doc`` attributes are read.
    option_class : type or None
        Options class accepted by the function, or None if it takes none.

    Returns
    -------
    wrapper : callable
        The same object that was passed in.
    """
    arity = func.arity
    wrapper.__arrow_compute_function__ = dict(name=func.name,
                                              arity=arity)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    cpp_doc = func._doc
    summary = cpp_doc.summary
    if not summary:
        # A variadic function reports its arity as Ellipsis, which cannot be
        # ordered against an int; treat it as taking several arguments
        # instead of letting the comparison raise TypeError.
        takes_many = arity is Ellipsis or arity > 1
        arg_str = "arguments" if takes_many else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    description = cpp_doc.description
    arg_names = _get_arg_names(func)

    doc_pieces.append("""\
        {}.

        """.format(summary))

    if description:
        doc_pieces.append("{}\n\n".format(description))

    doc_pieces.append("""\
        Parameters
        ----------
        """)

    # The accepted argument type depends only on the function kind, so it
    # is the same for every argument.
    if func.kind in ('vector', 'scalar_aggregate'):
        arg_type = 'Array-like'
    else:
        arg_type = 'Array-like or scalar-like'
    for arg_name in arg_names:
        doc_pieces.append("""\
            {} : {}
                Argument to compute function
            """.format(arg_name, arg_type))

    doc_pieces.append("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    if option_class is not None:
        doc_pieces.append("""\
            options : pyarrow.compute.{0}, optional
                Parameters altering compute function semantics.
            """.format(option_class.__name__))
        # Each constructor parameter of the options class is also accepted
        # as a keyword argument by the wrapper, so document them all.
        options_sig = inspect.signature(option_class)
        for p in options_sig.parameters.values():
            doc_pieces.append("""\
            {0} : optional
                Parameter for {1} constructor. Either `options`
                or `{0}` can be passed, but not both at the same time.
            """.format(p.name, option_class.__name__))

    # Every piece is dedented on its own because the literals above are
    # indented to match the surrounding code.
    wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces)
    return wrapper
148
149
def _get_options_class(func):
    """
    Look up the Python options class named in *func*'s documentation.

    Returns None when the function declares no options class, or when the
    named class is not present in this module's globals (in which case a
    RuntimeWarning is emitted as well).
    """
    class_name = func._doc.options_class
    if not class_name:
        return None

    namespace = globals()
    if class_name in namespace:
        return namespace[class_name]

    message = "Python binding for {} not exposed".format(class_name)
    warnings.warn(message, RuntimeWarning)
    return None
160
161
def _handle_options(name, option_class, options, kwargs):
    """
    Resolve the options object to pass to compute function *name*.

    Parameters
    ----------
    name : str
        Function name, used only in error messages.
    option_class : type
        Options class expected by the function.
    options : option_class, dict or None
        Explicit ``options`` argument supplied by the caller.
    kwargs : dict
        Extra keyword arguments supplied by the caller.

    Returns
    -------
    An *option_class* instance, or None when neither *options* nor
    *kwargs* were given.

    Raises
    ------
    TypeError
        If both *options* and *kwargs* are given, or if *options* is
        neither a dict nor an instance of *option_class*.
    """
    if kwargs:
        # Keyword arguments and an explicit options object are exclusive.
        if options is not None:
            raise TypeError(
                "Function {!r} called with both an 'options' argument "
                "and additional named arguments"
                .format(name))
        return option_class(**kwargs)

    if options is None:
        return options
    if isinstance(options, dict):
        return option_class(**options)
    if isinstance(options, option_class):
        return options
    raise TypeError(
        "Function {!r} expected a {} parameter, got {}"
        .format(name, option_class, type(options)))
181
182
def _make_generic_wrapper(func_name, func, option_class):
    """
    Build a Python callable that forwards its arguments to ``func.call``.

    When *option_class* is None the wrapper accepts only positional
    arguments and ``memory_pool``. Otherwise it also accepts ``options``
    and arbitrary keyword arguments, which are resolved through
    ``_handle_options`` before the call.
    """
    if option_class is not None:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):
            resolved = _handle_options(func_name, option_class, options,
                                       kwargs)
            return func.call(args, resolved, memory_pool)
        return wrapper

    def wrapper(*args, memory_pool=None):
        return func.call(args, None, memory_pool)
    return wrapper
193
194
def _make_signature(arg_names, var_arg_names, option_class):
    """
    Build the ``inspect.Signature`` advertised by a generated wrapper.

    The signature consists of, in order: the positional-or-keyword
    *arg_names*, the var-positional *var_arg_names*, a keyword-only
    ``memory_pool`` and, when *option_class* is given, a keyword-only
    ``options`` followed by the option class constructor parameters
    converted to keyword-only.
    """
    from inspect import Parameter

    params = [Parameter(arg, Parameter.POSITIONAL_OR_KEYWORD)
              for arg in arg_names]
    params.extend(Parameter(arg, Parameter.VAR_POSITIONAL)
                  for arg in var_arg_names)
    params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
                            default=None))
    if option_class is not None:
        params.append(Parameter("options", Parameter.KEYWORD_ONLY,
                                default=None))
        # XXX for now, our generic wrappers don't allow positional
        # option arguments
        option_params = inspect.signature(option_class).parameters
        params.extend(opt.replace(kind=Parameter.KEYWORD_ONLY)
                      for opt in option_params.values())
    return inspect.Signature(params)
213
214
def _wrap_function(name, func):
    """
    Create the fully decorated Python wrapper for compute function *func*.

    A trailing argument name starting with ``*`` is treated as the
    var-positional parameter: it is removed from the regular argument
    names and its leading asterisks are stripped.
    """
    option_class = _get_options_class(func)
    arg_names = _get_arg_names(func)

    var_arg_names = []
    if arg_names and arg_names[-1].startswith('*'):
        var_arg_names.append(arg_names.pop().lstrip('*'))

    wrapper = _make_generic_wrapper(name, func, option_class)
    signature = _make_signature(arg_names, var_arg_names, option_class)
    wrapper.__signature__ = signature
    return _decorate_compute_function(wrapper, name, func, option_class)
228
229
def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Every function listed by the registry is wrapped and stored in this
    module's globals, both under its registry name and under a
    keyword-safe alias when the two differ.

    Note that some of the automatically-generated wrappers may be
    overridden by custom versions below.
    """
    namespace = globals()
    registry = function_registry()

    # Avoid clashes with Python keywords
    keyword_safe = {'and': 'and_',
                    'or': 'or_'}

    for cpp_name in registry.list_functions():
        func = registry.get_function(cpp_name)
        exposed = keyword_safe.get(cpp_name, cpp_name)
        assert exposed not in namespace, exposed
        wrapped = _wrap_function(exposed, func)
        namespace[cpp_name] = wrapped
        namespace[exposed] = wrapped
249
250
# Populate this module's namespace with the generated wrappers at import
# time, before the hand-written functions are defined.
_make_global_functions()
252
253
def cast(arr, target_type, safe=True):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array or ChunkedArray
    target_type : DataType or type string alias
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions

    Returns
    -------
    casted : Array

    Raises
    ------
    ValueError
        If *target_type* is None.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at 0x10420eb88>
    [
      1262304000000,
      1420070400000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if target_type is None:
        raise ValueError("Cast target type must not be None")
    make_options = CastOptions.safe if safe else CastOptions.unsafe
    return call_function("cast", [arr], make_options(target_type))
310
311
def count_substring(array, pattern, *, ignore_case=False):
    """
    Count the occurrences of substring *pattern* in each value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to count exact occurrences of.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("count_substring", [array], options)
332
333
def count_substring_regex(array, pattern, *, ignore_case=False):
    """
    Count the non-overlapping matches of regex *pattern* in each value
    of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regex pattern to count matches of.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("count_substring_regex", [array], options)
354
355
def find_substring(array, pattern, *, ignore_case=False):
    """
    Find the index of the first occurrence of substring *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to locate by exact match.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("find_substring", [array], options)
376
377
def find_substring_regex(array, pattern, *, ignore_case=False):
    """
    Find the index of the first match of regex *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regex pattern to search for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("find_substring_regex", [array], options)
398
399
def match_like(array, pattern, *, ignore_case=False):
    """
    Test if the SQL-style LIKE pattern *pattern* matches a value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        SQL-style LIKE pattern. '%' will match any number of
        characters, '_' will match exactly one character, and all
        other characters match themselves. To match a literal percent
        sign or underscore, precede the character with a backslash.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_like", [array], options)
424
425
def match_substring(array, pattern, *, ignore_case=False):
    """
    Test if substring *pattern* is contained within a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to look for by exact match.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring", [array], options)
445
446
def match_substring_regex(array, pattern, *, ignore_case=False):
    """
    Test if regex *pattern* matches at any position a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regex pattern to search for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring_regex", [array], options)
466
467
def mode(array, n=1, *, skip_nulls=True, min_count=0):
    """
    Return top-n most common values and number of times they occur in a passed
    numerical (chunked) array, in descending order of occurrence. If there are
    multiple values with same count, the smaller one is returned first.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    n : int, default 1
        Specify the top-n values.
    skip_nulls : bool, default True
        If True, ignore nulls in the input. Else return an empty array
        if any input is null.
    min_count : int, default 0
        If there are fewer than this many values in the input, return
        an empty array.

    Returns
    -------
    An array of <input type "Mode", int64_t "Count"> structs

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    >>> modes[1]
    <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
    """
    return call_function(
        "mode",
        [array],
        ModeOptions(n, skip_nulls=skip_nulls, min_count=min_count),
    )
503
504
def filter(data, mask, null_selection_behavior='drop'):
    """
    Select values (or records) from array- or table-like data given boolean
    filter, where true values are selected.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    mask : Array, ChunkedArray
        Must be of boolean type
    null_selection_behavior : str, default 'drop'
        Configure the behavior on encountering a null slot in the mask.
        Allowed values are 'drop' and 'emit_null'.

        - 'drop': nulls will be treated as equivalent to False.
        - 'emit_null': nulls will result in a null in the output.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e"])
    >>> mask = pa.array([True, False, None, False, True])
    >>> arr.filter(mask)
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      "e"
    ]
    >>> arr.filter(mask, null_selection_behavior='emit_null')
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      null,
      "e"
    ]
    """
    return call_function('filter', [data, mask],
                         FilterOptions(null_selection_behavior))
547
548
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    When *start* and/or *end* are given, only that slice of *data* is
    searched; a match position is reported relative to the unsliced
    *data*.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
        Converted to a scalar of ``data.type`` if it is not already one.
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : the index, or -1 if not found
    """
    # Narrow the search window first.
    if start is None:
        if end is not None:
            data = data.slice(0, end)
    elif end is None:
        data = data.slice(start)
    else:
        data = data.slice(start, end - start)

    # The lookup value must be a scalar of the same type as the data.
    if isinstance(value, pa.Scalar):
        if data.type != value.type:
            value = pa.scalar(value.as_py(), type=data.type)
    else:
        value = pa.scalar(value, type=data.type)

    result = call_function('index', [data], IndexOptions(value=value),
                           memory_pool)
    if start is None:
        return result

    # A hit was found inside the slice: shift it back by the slice offset.
    position = result.as_py()
    if position >= 0:
        result = pa.scalar(position + start, type=pa.int64())
    return result
583
584
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    return call_function('take', [data, indices],
                         TakeOptions(boundscheck=boundscheck), memory_pool)
626
627
def fill_null(values, fill_value):
    """
    Replace each null element in values with fill_value. The fill_value must be
    the same type as values or able to be implicitly casted to the array's
    type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar
        Each null element is replaced with the corresponding value
        from fill_value. Its ``type`` attribute is used as the target
        type for fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as data will attempt to cast.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
    [
      1,
      2,
      5,
      3
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        # Plain Python value: build a scalar of the target type directly.
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Arrays and chunked arrays have no as_py(); cast them to the
            # target type instead of round-tripping through Python.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])
668
669
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like, optional
        Column key names to order by when input is table-like data.
        The passed object is never modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
    [
      5,
      4,
      2
    ]
    """
    # Work on a copy so that a list supplied by the caller is left untouched.
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input has no columns to name; a placeholder key carries the
        # sort order.
        keys.append(("dummy", "descending"))
    else:
        keys = [(key_name, "descending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)
714
715
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like, optional
        Column key names to order by when input is table-like data.
        The passed object is never modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
    [
      0,
      1,
      2
    ]
    """
    # Work on a copy so that a list supplied by the caller is left untouched.
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input has no columns to name; a placeholder key carries the
        # sort order.
        keys.append(("dummy", "ascending"))
    else:
        keys = [(key_name, "ascending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)