]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | from pyarrow._compute import ( # noqa | |
19 | Function, | |
20 | FunctionOptions, | |
21 | FunctionRegistry, | |
22 | HashAggregateFunction, | |
23 | HashAggregateKernel, | |
24 | Kernel, | |
25 | ScalarAggregateFunction, | |
26 | ScalarAggregateKernel, | |
27 | ScalarFunction, | |
28 | ScalarKernel, | |
29 | VectorFunction, | |
30 | VectorKernel, | |
31 | # Option classes | |
32 | ArraySortOptions, | |
33 | AssumeTimezoneOptions, | |
34 | CastOptions, | |
35 | CountOptions, | |
36 | DayOfWeekOptions, | |
37 | DictionaryEncodeOptions, | |
38 | ElementWiseAggregateOptions, | |
39 | ExtractRegexOptions, | |
40 | FilterOptions, | |
41 | IndexOptions, | |
42 | JoinOptions, | |
43 | MakeStructOptions, | |
44 | MatchSubstringOptions, | |
45 | ModeOptions, | |
46 | NullOptions, | |
47 | PadOptions, | |
48 | PartitionNthOptions, | |
49 | QuantileOptions, | |
50 | ReplaceSliceOptions, | |
51 | ReplaceSubstringOptions, | |
52 | RoundOptions, | |
53 | RoundToMultipleOptions, | |
54 | ScalarAggregateOptions, | |
55 | SelectKOptions, | |
56 | SetLookupOptions, | |
57 | SliceOptions, | |
58 | SortOptions, | |
59 | SplitOptions, | |
60 | SplitPatternOptions, | |
61 | StrftimeOptions, | |
62 | StrptimeOptions, | |
63 | TakeOptions, | |
64 | TDigestOptions, | |
65 | TrimOptions, | |
66 | VarianceOptions, | |
67 | WeekOptions, | |
68 | # Functions | |
69 | call_function, | |
70 | function_registry, | |
71 | get_function, | |
72 | list_functions, | |
73 | ) | |
74 | ||
75 | import inspect | |
76 | from textwrap import dedent | |
77 | import warnings | |
78 | ||
79 | import pyarrow as pa | |
80 | ||
81 | ||
82 | def _get_arg_names(func): | |
83 | return func._doc.arg_names | |
84 | ||
85 | ||
def _decorate_compute_function(wrapper, exposed_name, func, option_class):
    # Decorate the given compute function wrapper with useful metadata
    # and documentation.
    #
    # The docstring is assembled from the C++-side doc (summary,
    # description, argument names) plus the Python-side parameters
    # ("memory_pool", "options", and the option-class constructor
    # parameters).  Each piece is an indented triple-quoted string that
    # is dedented at the very end before joining.
    wrapper.__arrow_compute_function__ = dict(name=func.name,
                                              arity=func.arity)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    cpp_doc = func._doc
    summary = cpp_doc.summary
    if not summary:
        # No C++ summary available: synthesize a generic one-liner.
        arg_str = "arguments" if func.arity > 1 else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    description = cpp_doc.description
    arg_names = _get_arg_names(func)

    doc_pieces.append("""\
        {}.

        """.format(summary))

    if description:
        doc_pieces.append("{}\n\n".format(description))

    doc_pieces.append("""\
        Parameters
        ----------
        """)

    # Document each data argument.  Vector and scalar-aggregate kernels
    # only accept array-like inputs; other kinds also accept scalars.
    for arg_name in arg_names:
        if func.kind in ('vector', 'scalar_aggregate'):
            arg_type = 'Array-like'
        else:
            arg_type = 'Array-like or scalar-like'
        doc_pieces.append("""\
            {} : {}
                Argument to compute function
            """.format(arg_name, arg_type))

    doc_pieces.append("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    if option_class is not None:
        doc_pieces.append("""\
            options : pyarrow.compute.{0}, optional
                Parameters altering compute function semantics.
            """.format(option_class.__name__))
        # Also document each option-class constructor parameter, since the
        # generated wrapper accepts them directly as keyword arguments.
        options_sig = inspect.signature(option_class)
        for p in options_sig.parameters.values():
            doc_pieces.append("""\
                {0} : optional
                    Parameter for {1} constructor. Either `options`
                    or `{0}` can be passed, but not both at the same time.
                """.format(p.name, option_class.__name__))

    wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces)
    return wrapper
148 | ||
149 | ||
150 | def _get_options_class(func): | |
151 | class_name = func._doc.options_class | |
152 | if not class_name: | |
153 | return None | |
154 | try: | |
155 | return globals()[class_name] | |
156 | except KeyError: | |
157 | warnings.warn("Python binding for {} not exposed" | |
158 | .format(class_name), RuntimeWarning) | |
159 | return None | |
160 | ||
161 | ||
162 | def _handle_options(name, option_class, options, kwargs): | |
163 | if kwargs: | |
164 | if options is None: | |
165 | return option_class(**kwargs) | |
166 | raise TypeError( | |
167 | "Function {!r} called with both an 'options' argument " | |
168 | "and additional named arguments" | |
169 | .format(name)) | |
170 | ||
171 | if options is not None: | |
172 | if isinstance(options, dict): | |
173 | return option_class(**options) | |
174 | elif isinstance(options, option_class): | |
175 | return options | |
176 | raise TypeError( | |
177 | "Function {!r} expected a {} parameter, got {}" | |
178 | .format(name, option_class, type(options))) | |
179 | ||
180 | return options | |
181 | ||
182 | ||
183 | def _make_generic_wrapper(func_name, func, option_class): | |
184 | if option_class is None: | |
185 | def wrapper(*args, memory_pool=None): | |
186 | return func.call(args, None, memory_pool) | |
187 | else: | |
188 | def wrapper(*args, memory_pool=None, options=None, **kwargs): | |
189 | options = _handle_options(func_name, option_class, options, | |
190 | kwargs) | |
191 | return func.call(args, options, memory_pool) | |
192 | return wrapper | |
193 | ||
194 | ||
195 | def _make_signature(arg_names, var_arg_names, option_class): | |
196 | from inspect import Parameter | |
197 | params = [] | |
198 | for name in arg_names: | |
199 | params.append(Parameter(name, Parameter.POSITIONAL_OR_KEYWORD)) | |
200 | for name in var_arg_names: | |
201 | params.append(Parameter(name, Parameter.VAR_POSITIONAL)) | |
202 | params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY, | |
203 | default=None)) | |
204 | if option_class is not None: | |
205 | params.append(Parameter("options", Parameter.KEYWORD_ONLY, | |
206 | default=None)) | |
207 | options_sig = inspect.signature(option_class) | |
208 | for p in options_sig.parameters.values(): | |
209 | # XXX for now, our generic wrappers don't allow positional | |
210 | # option arguments | |
211 | params.append(p.replace(kind=Parameter.KEYWORD_ONLY)) | |
212 | return inspect.Signature(params) | |
213 | ||
214 | ||
def _wrap_function(name, func):
    """Create a fully-decorated Python wrapper for compute function *func*."""
    option_class = _get_options_class(func)
    arg_names = _get_arg_names(func)

    # A trailing "*name" in the C++ doc marks a variadic argument.
    var_arg_names = []
    if arg_names and arg_names[-1].startswith('*'):
        var_arg_names.append(arg_names.pop().lstrip('*'))

    wrapper = _make_generic_wrapper(name, func, option_class)
    wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
                                            option_class)
    return _decorate_compute_function(wrapper, name, func, option_class)
228 | ||
229 | ||
def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Note that some of the automatically-generated wrappers may be
    overridden by custom versions below.
    """
    g = globals()
    reg = function_registry()

    # Avoid clashes with Python keywords
    rewrites = {'and': 'and_', 'or': 'or_'}

    for cpp_name in reg.list_functions():
        name = rewrites.get(cpp_name, cpp_name)
        func = reg.get_function(cpp_name)
        assert name not in g, name
        # Expose the wrapper under both the C++ name and the
        # (possibly rewritten) Python-safe name.
        g[cpp_name] = g[name] = _wrap_function(name, func)
249 | ||
250 | ||
# Populate this module's namespace with generated wrappers at import time.
_make_global_functions()
252 | ||
253 | ||
def cast(arr, target_type, safe=True):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array or ChunkedArray
    target_type : DataType or type string alias
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at 0x10420eb88>
    [
      1262304000000,
      1420070400000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])

    Returns
    -------
    casted : Array
    """
    if target_type is None:
        raise ValueError("Cast target type must not be None")
    # Select the checked or unchecked cast variant up front.
    factory = CastOptions.safe if safe else CastOptions.unsafe
    return call_function("cast", [arr], factory(target_type))
310 | ||
311 | ||
def count_substring(array, pattern, *, ignore_case=False):
    """
    Count the occurrences of substring *pattern* in each value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to count exact matches of.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("count_substring", [array], options)
332 | ||
333 | ||
def count_substring_regex(array, pattern, *, ignore_case=False):
    """
    Count the non-overlapping matches of regex *pattern* in each value
    of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regular expression to count matches of.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("count_substring_regex", [array], options)
354 | ||
355 | ||
def find_substring(array, pattern, *, ignore_case=False):
    """
    Find the index of the first occurrence of substring *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to search exact matches for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("find_substring", [array], options)
376 | ||
377 | ||
def find_substring_regex(array, pattern, *, ignore_case=False):
    """
    Find the index of the first match of regex *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regular expression to search for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("find_substring_regex", [array], options)
398 | ||
399 | ||
def match_like(array, pattern, *, ignore_case=False):
    """
    Test if the SQL-style LIKE pattern *pattern* matches a value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        SQL-style LIKE pattern. '%' will match any number of
        characters, '_' will match exactly one character, and all
        other characters match themselves. To match a literal percent
        sign or underscore, precede the character with a backslash.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_like", [array], options)
424 | ||
425 | ||
def match_substring(array, pattern, *, ignore_case=False):
    """
    Test if substring *pattern* is contained within a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Substring to search exact matches for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring", [array], options)
445 | ||
446 | ||
def match_substring_regex(array, pattern, *, ignore_case=False):
    """
    Test if regex *pattern* matches at any position a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regular expression to search for.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring_regex", [array], options)
466 | ||
467 | ||
def mode(array, n=1, *, skip_nulls=True, min_count=0):
    """
    Return the top-n most common values in a numerical (chunked) array,
    together with the number of times they occur, in descending order of
    occurrence. Ties on the count are broken by returning the smaller
    value first.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    n : int, default 1
        Specify the top-n values.
    skip_nulls : bool, default True
        If True, ignore nulls in the input. Else return an empty array
        if any input is null.
    min_count : int, default 0
        If there are fewer than this many values in the input, return
        an empty array.

    Returns
    -------
    An array of <input type "Mode", int64_t "Count"> structs

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    >>> modes[1]
    <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
    """
    return call_function(
        "mode", [array],
        ModeOptions(n, skip_nulls=skip_nulls, min_count=min_count))
503 | ||
504 | ||
def filter(data, mask, null_selection_behavior='drop'):
    """
    Select values (or records) from array- or table-like data given boolean
    filter, where true values are selected.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    mask : Array, ChunkedArray
        Must be of boolean type
    null_selection_behavior : str, default 'drop'
        Configure the behavior on encountering a null slot in the mask.
        Allowed values are 'drop' and 'emit_null'.

        - 'drop': nulls will be treated as equivalent to False.
        - 'emit_null': nulls will result in a null in the output.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e"])
    >>> mask = pa.array([True, False, None, False, True])
    >>> arr.filter(mask)
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      "e"
    ]
    >>> arr.filter(mask, null_selection_behavior='emit_null')
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      null,
      "e"
    ]
    """
    return call_function(
        'filter', [data, mask], FilterOptions(null_selection_behavior))
547 | ||
548 | ||
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
    start : int, optional
        Index to start searching from (inclusive).
    end : int, optional
        Index to stop searching at (exclusive).
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : the index, or -1 if not found
    """
    # Narrow the search window with zero-copy slices.
    if start is None:
        if end is not None:
            data = data.slice(0, end)
    elif end is None:
        data = data.slice(start)
    else:
        data = data.slice(start, end - start)

    # Coerce *value* to a scalar matching the data type.
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)

    result = call_function('index', [data], IndexOptions(value=value),
                           memory_pool)
    # Translate a hit inside the slice back to an index in the full data.
    if start is not None and result.as_py() >= 0:
        result = pa.scalar(result.as_py() + start, type=pa.int64())
    return result
583 | ||
584 | ||
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    return call_function('take', [data, indices],
                         TakeOptions(boundscheck=boundscheck), memory_pool)
626 | ||
627 | ||
def fill_null(values, fill_value):
    """
    Replace each null element in values with fill_value. The fill_value must
    be the same type as values or able to be implicitly casted to the
    array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as data will attempt to cast.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
    [
      1,
      2,
      5,
      3
    ]
    """
    # Coerce *fill_value* to an Arrow value of the data's type.
    if isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        if values.type != fill_value.type:
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
    else:
        fill_value = pa.scalar(fill_value, type=values.type)

    return call_function("coalesce", [values, fill_value])
668 | ||
669 | ||
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or
    table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
    [
      5,
      4,
      2
    ]
    """
    # Copy the caller's sort_keys so it is never mutated (the original
    # implementation appended to the caller-supplied list in place).
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Flat arrays sort on themselves; the key name is a placeholder.
        keys = keys + [("dummy", "descending")]
    else:
        keys = [(key_name, "descending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)
714 | ||
715 | ||
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
    [
      0,
      1,
      2
    ]
    """
    # Copy the caller's sort_keys so it is never mutated (the original
    # implementation appended to the caller-supplied list in place).
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Flat arrays sort on themselves; the key name is a placeholder.
        keys = keys + [("dummy", "ascending")]
    else:
        keys = [(key_name, "ascending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)