]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | // Eager evaluation convenience APIs for invoking common functions, including | |
19 | // necessary memory allocations | |
20 | ||
21 | #pragma once | |
22 | ||
23 | #include "arrow/compute/function.h" | |
24 | #include "arrow/datum.h" | |
25 | #include "arrow/result.h" | |
26 | #include "arrow/util/macros.h" | |
27 | #include "arrow/util/visibility.h" | |
28 | ||
29 | namespace arrow { | |
30 | ||
31 | class Array; | |
32 | ||
33 | namespace compute { | |
34 | ||
35 | class ExecContext; | |
36 | ||
37 | // ---------------------------------------------------------------------- | |
38 | // Aggregate functions | |
39 | ||
40 | /// \addtogroup compute-concrete-options | |
41 | /// @{ | |
42 | ||
43 | /// \brief Control general scalar aggregate kernel behavior | |
44 | /// | |
45 | /// By default, null values are ignored (skip_nulls = true). | |
46 | class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { | |
47 | public: | |
48 | explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); | |
49 | constexpr static char const kTypeName[] = "ScalarAggregateOptions"; | |
50 | static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } | |
51 | ||
52 | /// If true (the default), null values are ignored. Otherwise, if any value is null, | |
53 | /// emit null. | |
54 | bool skip_nulls; | |
55 | /// If less than this many non-null values are observed, emit null. | |
56 | uint32_t min_count; | |
57 | }; | |
58 | ||
59 | /// \brief Control count aggregate kernel behavior. | |
60 | /// | |
61 | /// By default, only non-null values are counted. | |
62 | class ARROW_EXPORT CountOptions : public FunctionOptions { | |
63 | public: | |
64 | enum CountMode { | |
65 | /// Count only non-null values. | |
66 | ONLY_VALID = 0, | |
67 | /// Count only null values. | |
68 | ONLY_NULL, | |
69 | /// Count both non-null and null values. | |
70 | ALL, | |
71 | }; | |
72 | explicit CountOptions(CountMode mode = CountMode::ONLY_VALID); | |
73 | constexpr static char const kTypeName[] = "CountOptions"; | |
74 | static CountOptions Defaults() { return CountOptions{}; } | |
75 | ||
76 | CountMode mode; | |
77 | }; | |
78 | ||
79 | /// \brief Control Mode kernel behavior | |
80 | /// | |
81 | /// Returns top-n common values and counts. | |
82 | /// By default, returns the most common value and count. | |
83 | class ARROW_EXPORT ModeOptions : public FunctionOptions { | |
84 | public: | |
85 | explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); | |
86 | constexpr static char const kTypeName[] = "ModeOptions"; | |
87 | static ModeOptions Defaults() { return ModeOptions{}; } | |
88 | ||
89 | int64_t n = 1; | |
90 | /// If true (the default), null values are ignored. Otherwise, if any value is null, | |
91 | /// emit null. | |
92 | bool skip_nulls; | |
93 | /// If less than this many non-null values are observed, emit null. | |
94 | uint32_t min_count; | |
95 | }; | |
96 | ||
97 | /// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel | |
98 | /// | |
99 | /// The divisor used in calculations is N - ddof, where N is the number of elements. | |
100 | /// By default, ddof is zero, and population variance or stddev is returned. | |
101 | class ARROW_EXPORT VarianceOptions : public FunctionOptions { | |
102 | public: | |
103 | explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0); | |
104 | constexpr static char const kTypeName[] = "VarianceOptions"; | |
105 | static VarianceOptions Defaults() { return VarianceOptions{}; } | |
106 | ||
107 | int ddof = 0; | |
108 | /// If true (the default), null values are ignored. Otherwise, if any value is null, | |
109 | /// emit null. | |
110 | bool skip_nulls; | |
111 | /// If less than this many non-null values are observed, emit null. | |
112 | uint32_t min_count; | |
113 | }; | |
114 | ||
115 | /// \brief Control Quantile kernel behavior | |
116 | /// | |
117 | /// By default, returns the median value. | |
118 | class ARROW_EXPORT QuantileOptions : public FunctionOptions { | |
119 | public: | |
120 | /// Interpolation method to use when quantile lies between two data points | |
121 | enum Interpolation { | |
122 | LINEAR = 0, | |
123 | LOWER, | |
124 | HIGHER, | |
125 | NEAREST, | |
126 | MIDPOINT, | |
127 | }; | |
128 | ||
129 | explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR, | |
130 | bool skip_nulls = true, uint32_t min_count = 0); | |
131 | ||
132 | explicit QuantileOptions(std::vector<double> q, | |
133 | enum Interpolation interpolation = LINEAR, | |
134 | bool skip_nulls = true, uint32_t min_count = 0); | |
135 | ||
136 | constexpr static char const kTypeName[] = "QuantileOptions"; | |
137 | static QuantileOptions Defaults() { return QuantileOptions{}; } | |
138 | ||
139 | /// quantile must be between 0 and 1 inclusive | |
140 | std::vector<double> q; | |
141 | enum Interpolation interpolation; | |
142 | /// If true (the default), null values are ignored. Otherwise, if any value is null, | |
143 | /// emit null. | |
144 | bool skip_nulls; | |
145 | /// If less than this many non-null values are observed, emit null. | |
146 | uint32_t min_count; | |
147 | }; | |
148 | ||
149 | /// \brief Control TDigest approximate quantile kernel behavior | |
150 | /// | |
151 | /// By default, returns the median value. | |
152 | class ARROW_EXPORT TDigestOptions : public FunctionOptions { | |
153 | public: | |
154 | explicit TDigestOptions(double q = 0.5, uint32_t delta = 100, | |
155 | uint32_t buffer_size = 500, bool skip_nulls = true, | |
156 | uint32_t min_count = 0); | |
157 | explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100, | |
158 | uint32_t buffer_size = 500, bool skip_nulls = true, | |
159 | uint32_t min_count = 0); | |
160 | constexpr static char const kTypeName[] = "TDigestOptions"; | |
161 | static TDigestOptions Defaults() { return TDigestOptions{}; } | |
162 | ||
163 | /// quantile must be between 0 and 1 inclusive | |
164 | std::vector<double> q; | |
165 | /// compression parameter, default 100 | |
166 | uint32_t delta; | |
167 | /// input buffer size, default 500 | |
168 | uint32_t buffer_size; | |
169 | /// If true (the default), null values are ignored. Otherwise, if any value is null, | |
170 | /// emit null. | |
171 | bool skip_nulls; | |
172 | /// If less than this many non-null values are observed, emit null. | |
173 | uint32_t min_count; | |
174 | }; | |
175 | ||
176 | /// \brief Control Index kernel behavior | |
177 | class ARROW_EXPORT IndexOptions : public FunctionOptions { | |
178 | public: | |
179 | explicit IndexOptions(std::shared_ptr<Scalar> value); | |
180 | // Default constructor for serialization | |
181 | IndexOptions(); | |
182 | constexpr static char const kTypeName[] = "IndexOptions"; | |
183 | ||
184 | std::shared_ptr<Scalar> value; | |
185 | }; | |
186 | ||
187 | /// @} | |
188 | ||
189 | /// \brief Count values in an array. | |
190 | /// | |
191 | /// \param[in] options counting options, see CountOptions for more information | |
192 | /// \param[in] datum to count | |
193 | /// \param[in] ctx the function execution context, optional | |
194 | /// \return out resulting datum | |
195 | /// | |
196 | /// \since 1.0.0 | |
197 | /// \note API not yet finalized | |
198 | ARROW_EXPORT | |
199 | Result<Datum> Count(const Datum& datum, | |
200 | const CountOptions& options = CountOptions::Defaults(), | |
201 | ExecContext* ctx = NULLPTR); | |
202 | ||
203 | /// \brief Compute the mean of a numeric array. | |
204 | /// | |
205 | /// \param[in] value datum to compute the mean, expecting Array | |
206 | /// \param[in] options see ScalarAggregateOptions for more information | |
207 | /// \param[in] ctx the function execution context, optional | |
208 | /// \return datum of the computed mean as a DoubleScalar | |
209 | /// | |
210 | /// \since 1.0.0 | |
211 | /// \note API not yet finalized | |
212 | ARROW_EXPORT | |
213 | Result<Datum> Mean( | |
214 | const Datum& value, | |
215 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
216 | ExecContext* ctx = NULLPTR); | |
217 | ||
218 | /// \brief Compute the product of values of a numeric array. | |
219 | /// | |
220 | /// \param[in] value datum to compute product of, expecting Array or ChunkedArray | |
221 | /// \param[in] options see ScalarAggregateOptions for more information | |
222 | /// \param[in] ctx the function execution context, optional | |
223 | /// \return datum of the computed sum as a Scalar | |
224 | /// | |
225 | /// \since 6.0.0 | |
226 | /// \note API not yet finalized | |
227 | ARROW_EXPORT | |
228 | Result<Datum> Product( | |
229 | const Datum& value, | |
230 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
231 | ExecContext* ctx = NULLPTR); | |
232 | ||
233 | /// \brief Sum values of a numeric array. | |
234 | /// | |
235 | /// \param[in] value datum to sum, expecting Array or ChunkedArray | |
236 | /// \param[in] options see ScalarAggregateOptions for more information | |
237 | /// \param[in] ctx the function execution context, optional | |
238 | /// \return datum of the computed sum as a Scalar | |
239 | /// | |
240 | /// \since 1.0.0 | |
241 | /// \note API not yet finalized | |
242 | ARROW_EXPORT | |
243 | Result<Datum> Sum( | |
244 | const Datum& value, | |
245 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
246 | ExecContext* ctx = NULLPTR); | |
247 | ||
248 | /// \brief Calculate the min / max of a numeric array | |
249 | /// | |
250 | /// This function returns both the min and max as a struct scalar, with type | |
251 | /// struct<min: T, max: T>, where T is the input type | |
252 | /// | |
253 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
254 | /// \param[in] options see ScalarAggregateOptions for more information | |
255 | /// \param[in] ctx the function execution context, optional | |
256 | /// \return resulting datum as a struct<min: T, max: T> scalar | |
257 | /// | |
258 | /// \since 1.0.0 | |
259 | /// \note API not yet finalized | |
260 | ARROW_EXPORT | |
261 | Result<Datum> MinMax( | |
262 | const Datum& value, | |
263 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
264 | ExecContext* ctx = NULLPTR); | |
265 | ||
266 | /// \brief Test whether any element in a boolean array evaluates to true. | |
267 | /// | |
268 | /// This function returns true if any of the elements in the array evaluates | |
269 | /// to true and false otherwise. Null values are ignored by default. | |
270 | /// If null values are taken into account by setting ScalarAggregateOptions | |
271 | /// parameter skip_nulls = false then Kleene logic is used. | |
272 | /// See KleeneOr for more details on Kleene logic. | |
273 | /// | |
274 | /// \param[in] value input datum, expecting a boolean array | |
275 | /// \param[in] options see ScalarAggregateOptions for more information | |
276 | /// \param[in] ctx the function execution context, optional | |
277 | /// \return resulting datum as a BooleanScalar | |
278 | /// | |
279 | /// \since 3.0.0 | |
280 | /// \note API not yet finalized | |
281 | ARROW_EXPORT | |
282 | Result<Datum> Any( | |
283 | const Datum& value, | |
284 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
285 | ExecContext* ctx = NULLPTR); | |
286 | ||
287 | /// \brief Test whether all elements in a boolean array evaluate to true. | |
288 | /// | |
289 | /// This function returns true if all of the elements in the array evaluate | |
290 | /// to true and false otherwise. Null values are ignored by default. | |
291 | /// If null values are taken into account by setting ScalarAggregateOptions | |
292 | /// parameter skip_nulls = false then Kleene logic is used. | |
293 | /// See KleeneAnd for more details on Kleene logic. | |
294 | /// | |
295 | /// \param[in] value input datum, expecting a boolean array | |
296 | /// \param[in] options see ScalarAggregateOptions for more information | |
297 | /// \param[in] ctx the function execution context, optional | |
298 | /// \return resulting datum as a BooleanScalar | |
299 | ||
300 | /// \since 3.0.0 | |
301 | /// \note API not yet finalized | |
302 | ARROW_EXPORT | |
303 | Result<Datum> All( | |
304 | const Datum& value, | |
305 | const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), | |
306 | ExecContext* ctx = NULLPTR); | |
307 | ||
308 | /// \brief Calculate the modal (most common) value of a numeric array | |
309 | /// | |
310 | /// This function returns top-n most common values and number of times they occur as | |
311 | /// an array of `struct<mode: T, count: int64>`, where T is the input type. | |
312 | /// Values with larger counts are returned before smaller ones. | |
313 | /// If there are more than one values with same count, smaller value is returned first. | |
314 | /// | |
315 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
316 | /// \param[in] options see ModeOptions for more information | |
317 | /// \param[in] ctx the function execution context, optional | |
318 | /// \return resulting datum as an array of struct<mode: T, count: int64> | |
319 | /// | |
320 | /// \since 2.0.0 | |
321 | /// \note API not yet finalized | |
322 | ARROW_EXPORT | |
323 | Result<Datum> Mode(const Datum& value, | |
324 | const ModeOptions& options = ModeOptions::Defaults(), | |
325 | ExecContext* ctx = NULLPTR); | |
326 | ||
327 | /// \brief Calculate the standard deviation of a numeric array | |
328 | /// | |
329 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
330 | /// \param[in] options see VarianceOptions for more information | |
331 | /// \param[in] ctx the function execution context, optional | |
332 | /// \return datum of the computed standard deviation as a DoubleScalar | |
333 | /// | |
334 | /// \since 2.0.0 | |
335 | /// \note API not yet finalized | |
336 | ARROW_EXPORT | |
337 | Result<Datum> Stddev(const Datum& value, | |
338 | const VarianceOptions& options = VarianceOptions::Defaults(), | |
339 | ExecContext* ctx = NULLPTR); | |
340 | ||
341 | /// \brief Calculate the variance of a numeric array | |
342 | /// | |
343 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
344 | /// \param[in] options see VarianceOptions for more information | |
345 | /// \param[in] ctx the function execution context, optional | |
346 | /// \return datum of the computed variance as a DoubleScalar | |
347 | /// | |
348 | /// \since 2.0.0 | |
349 | /// \note API not yet finalized | |
350 | ARROW_EXPORT | |
351 | Result<Datum> Variance(const Datum& value, | |
352 | const VarianceOptions& options = VarianceOptions::Defaults(), | |
353 | ExecContext* ctx = NULLPTR); | |
354 | ||
355 | /// \brief Calculate the quantiles of a numeric array | |
356 | /// | |
357 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
358 | /// \param[in] options see QuantileOptions for more information | |
359 | /// \param[in] ctx the function execution context, optional | |
360 | /// \return resulting datum as an array | |
361 | /// | |
362 | /// \since 4.0.0 | |
363 | /// \note API not yet finalized | |
364 | ARROW_EXPORT | |
365 | Result<Datum> Quantile(const Datum& value, | |
366 | const QuantileOptions& options = QuantileOptions::Defaults(), | |
367 | ExecContext* ctx = NULLPTR); | |
368 | ||
369 | /// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm | |
370 | /// | |
371 | /// \param[in] value input datum, expecting Array or ChunkedArray | |
372 | /// \param[in] options see TDigestOptions for more information | |
373 | /// \param[in] ctx the function execution context, optional | |
374 | /// \return resulting datum as an array | |
375 | /// | |
376 | /// \since 4.0.0 | |
377 | /// \note API not yet finalized | |
378 | ARROW_EXPORT | |
379 | Result<Datum> TDigest(const Datum& value, | |
380 | const TDigestOptions& options = TDigestOptions::Defaults(), | |
381 | ExecContext* ctx = NULLPTR); | |
382 | ||
383 | /// \brief Find the first index of a value in an array. | |
384 | /// | |
385 | /// \param[in] value The array to search. | |
386 | /// \param[in] options The array to search for. See IndexOoptions. | |
387 | /// \param[in] ctx the function execution context, optional | |
388 | /// \return out a Scalar containing the index (or -1 if not found). | |
389 | /// | |
390 | /// \since 5.0.0 | |
391 | /// \note API not yet finalized | |
392 | ARROW_EXPORT | |
393 | Result<Datum> Index(const Datum& value, const IndexOptions& options, | |
394 | ExecContext* ctx = NULLPTR); | |
395 | ||
396 | namespace internal { | |
397 | ||
398 | /// Internal use only: streaming group identifier. | |
399 | /// Consumes batches of keys and yields batches of the group ids. | |
400 | class ARROW_EXPORT Grouper { | |
401 | public: | |
402 | virtual ~Grouper() = default; | |
403 | ||
404 | /// Construct a Grouper which receives the specified key types | |
405 | static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs, | |
406 | ExecContext* ctx = default_exec_context()); | |
407 | ||
408 | /// Consume a batch of keys, producing the corresponding group ids as an integer array. | |
409 | /// Currently only uint32 indices will be produced, eventually the bit width will only | |
410 | /// be as wide as necessary. | |
411 | virtual Result<Datum> Consume(const ExecBatch& batch) = 0; | |
412 | ||
413 | /// Get current unique keys. May be called multiple times. | |
414 | virtual Result<ExecBatch> GetUniques() = 0; | |
415 | ||
416 | /// Get the current number of groups. | |
417 | virtual uint32_t num_groups() const = 0; | |
418 | ||
419 | /// \brief Assemble lists of indices of identical elements. | |
420 | /// | |
421 | /// \param[in] ids An unsigned, all-valid integral array which will be | |
422 | /// used as grouping criteria. | |
423 | /// \param[in] num_groups An upper bound for the elements of ids | |
424 | /// \return A num_groups-long ListArray where the slot at i contains a | |
425 | /// list of indices where i appears in ids. | |
426 | /// | |
427 | /// MakeGroupings([ | |
428 | /// 2, | |
429 | /// 2, | |
430 | /// 5, | |
431 | /// 5, | |
432 | /// 2, | |
433 | /// 3 | |
434 | /// ], 8) == [ | |
435 | /// [], | |
436 | /// [], | |
437 | /// [0, 1, 4], | |
438 | /// [5], | |
439 | /// [], | |
440 | /// [2, 3], | |
441 | /// [], | |
442 | /// [] | |
443 | /// ] | |
444 | static Result<std::shared_ptr<ListArray>> MakeGroupings( | |
445 | const UInt32Array& ids, uint32_t num_groups, | |
446 | ExecContext* ctx = default_exec_context()); | |
447 | ||
448 | /// \brief Produce a ListArray whose slots are selections of `array` which correspond to | |
449 | /// the provided groupings. | |
450 | /// | |
451 | /// For example, | |
452 | /// ApplyGroupings([ | |
453 | /// [], | |
454 | /// [], | |
455 | /// [0, 1, 4], | |
456 | /// [5], | |
457 | /// [], | |
458 | /// [2, 3], | |
459 | /// [], | |
460 | /// [] | |
461 | /// ], [2, 2, 5, 5, 2, 3]) == [ | |
462 | /// [], | |
463 | /// [], | |
464 | /// [2, 2, 2], | |
465 | /// [3], | |
466 | /// [], | |
467 | /// [5, 5], | |
468 | /// [], | |
469 | /// [] | |
470 | /// ] | |
471 | static Result<std::shared_ptr<ListArray>> ApplyGroupings( | |
472 | const ListArray& groupings, const Array& array, | |
473 | ExecContext* ctx = default_exec_context()); | |
474 | }; | |
475 | ||
476 | /// \brief Configure a grouped aggregation | |
477 | struct ARROW_EXPORT Aggregate { | |
478 | /// the name of the aggregation function | |
479 | std::string function; | |
480 | ||
481 | /// options for the aggregation function | |
482 | const FunctionOptions* options; | |
483 | }; | |
484 | ||
485 | /// Internal use only: helper function for testing HashAggregateKernels. | |
486 | /// This will be replaced by streaming execution operators. | |
487 | ARROW_EXPORT | |
488 | Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys, | |
489 | const std::vector<Aggregate>& aggregates, bool use_threads = false, | |
490 | ExecContext* ctx = default_exec_context()); | |
491 | ||
492 | } // namespace internal | |
493 | } // namespace compute | |
494 | } // namespace arrow |