]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/compute/api_aggregate.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / compute / api_aggregate.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18// Eager evaluation convenience APIs for invoking common functions, including
19// necessary memory allocations
20
21#pragma once
22
23#include "arrow/compute/function.h"
24#include "arrow/datum.h"
25#include "arrow/result.h"
26#include "arrow/util/macros.h"
27#include "arrow/util/visibility.h"
28
29namespace arrow {
30
31class Array;
32
33namespace compute {
34
35class ExecContext;
36
37// ----------------------------------------------------------------------
38// Aggregate functions
39
40/// \addtogroup compute-concrete-options
41/// @{
42
43/// \brief Control general scalar aggregate kernel behavior
44///
45/// By default, null values are ignored (skip_nulls = true).
46class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
47 public:
48 explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
49 constexpr static char const kTypeName[] = "ScalarAggregateOptions";
50 static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
51
52 /// If true (the default), null values are ignored. Otherwise, if any value is null,
53 /// emit null.
54 bool skip_nulls;
55 /// If less than this many non-null values are observed, emit null.
56 uint32_t min_count;
57};
58
59/// \brief Control count aggregate kernel behavior.
60///
61/// By default, only non-null values are counted.
62class ARROW_EXPORT CountOptions : public FunctionOptions {
63 public:
64 enum CountMode {
65 /// Count only non-null values.
66 ONLY_VALID = 0,
67 /// Count only null values.
68 ONLY_NULL,
69 /// Count both non-null and null values.
70 ALL,
71 };
72 explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
73 constexpr static char const kTypeName[] = "CountOptions";
74 static CountOptions Defaults() { return CountOptions{}; }
75
76 CountMode mode;
77};
78
79/// \brief Control Mode kernel behavior
80///
81/// Returns top-n common values and counts.
82/// By default, returns the most common value and count.
83class ARROW_EXPORT ModeOptions : public FunctionOptions {
84 public:
85 explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
86 constexpr static char const kTypeName[] = "ModeOptions";
87 static ModeOptions Defaults() { return ModeOptions{}; }
88
89 int64_t n = 1;
90 /// If true (the default), null values are ignored. Otherwise, if any value is null,
91 /// emit null.
92 bool skip_nulls;
93 /// If less than this many non-null values are observed, emit null.
94 uint32_t min_count;
95};
96
97/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
98///
99/// The divisor used in calculations is N - ddof, where N is the number of elements.
100/// By default, ddof is zero, and population variance or stddev is returned.
101class ARROW_EXPORT VarianceOptions : public FunctionOptions {
102 public:
103 explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
104 constexpr static char const kTypeName[] = "VarianceOptions";
105 static VarianceOptions Defaults() { return VarianceOptions{}; }
106
107 int ddof = 0;
108 /// If true (the default), null values are ignored. Otherwise, if any value is null,
109 /// emit null.
110 bool skip_nulls;
111 /// If less than this many non-null values are observed, emit null.
112 uint32_t min_count;
113};
114
115/// \brief Control Quantile kernel behavior
116///
117/// By default, returns the median value.
118class ARROW_EXPORT QuantileOptions : public FunctionOptions {
119 public:
120 /// Interpolation method to use when quantile lies between two data points
121 enum Interpolation {
122 LINEAR = 0,
123 LOWER,
124 HIGHER,
125 NEAREST,
126 MIDPOINT,
127 };
128
129 explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
130 bool skip_nulls = true, uint32_t min_count = 0);
131
132 explicit QuantileOptions(std::vector<double> q,
133 enum Interpolation interpolation = LINEAR,
134 bool skip_nulls = true, uint32_t min_count = 0);
135
136 constexpr static char const kTypeName[] = "QuantileOptions";
137 static QuantileOptions Defaults() { return QuantileOptions{}; }
138
139 /// quantile must be between 0 and 1 inclusive
140 std::vector<double> q;
141 enum Interpolation interpolation;
142 /// If true (the default), null values are ignored. Otherwise, if any value is null,
143 /// emit null.
144 bool skip_nulls;
145 /// If less than this many non-null values are observed, emit null.
146 uint32_t min_count;
147};
148
149/// \brief Control TDigest approximate quantile kernel behavior
150///
151/// By default, returns the median value.
152class ARROW_EXPORT TDigestOptions : public FunctionOptions {
153 public:
154 explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
155 uint32_t buffer_size = 500, bool skip_nulls = true,
156 uint32_t min_count = 0);
157 explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
158 uint32_t buffer_size = 500, bool skip_nulls = true,
159 uint32_t min_count = 0);
160 constexpr static char const kTypeName[] = "TDigestOptions";
161 static TDigestOptions Defaults() { return TDigestOptions{}; }
162
163 /// quantile must be between 0 and 1 inclusive
164 std::vector<double> q;
165 /// compression parameter, default 100
166 uint32_t delta;
167 /// input buffer size, default 500
168 uint32_t buffer_size;
169 /// If true (the default), null values are ignored. Otherwise, if any value is null,
170 /// emit null.
171 bool skip_nulls;
172 /// If less than this many non-null values are observed, emit null.
173 uint32_t min_count;
174};
175
176/// \brief Control Index kernel behavior
177class ARROW_EXPORT IndexOptions : public FunctionOptions {
178 public:
179 explicit IndexOptions(std::shared_ptr<Scalar> value);
180 // Default constructor for serialization
181 IndexOptions();
182 constexpr static char const kTypeName[] = "IndexOptions";
183
184 std::shared_ptr<Scalar> value;
185};
186
187/// @}
188
189/// \brief Count values in an array.
190///
191/// \param[in] options counting options, see CountOptions for more information
192/// \param[in] datum to count
193/// \param[in] ctx the function execution context, optional
194/// \return out resulting datum
195///
196/// \since 1.0.0
197/// \note API not yet finalized
198ARROW_EXPORT
199Result<Datum> Count(const Datum& datum,
200 const CountOptions& options = CountOptions::Defaults(),
201 ExecContext* ctx = NULLPTR);
202
203/// \brief Compute the mean of a numeric array.
204///
205/// \param[in] value datum to compute the mean, expecting Array
206/// \param[in] options see ScalarAggregateOptions for more information
207/// \param[in] ctx the function execution context, optional
208/// \return datum of the computed mean as a DoubleScalar
209///
210/// \since 1.0.0
211/// \note API not yet finalized
212ARROW_EXPORT
213Result<Datum> Mean(
214 const Datum& value,
215 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
216 ExecContext* ctx = NULLPTR);
217
218/// \brief Compute the product of values of a numeric array.
219///
220/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
221/// \param[in] options see ScalarAggregateOptions for more information
222/// \param[in] ctx the function execution context, optional
223/// \return datum of the computed sum as a Scalar
224///
225/// \since 6.0.0
226/// \note API not yet finalized
227ARROW_EXPORT
228Result<Datum> Product(
229 const Datum& value,
230 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
231 ExecContext* ctx = NULLPTR);
232
233/// \brief Sum values of a numeric array.
234///
235/// \param[in] value datum to sum, expecting Array or ChunkedArray
236/// \param[in] options see ScalarAggregateOptions for more information
237/// \param[in] ctx the function execution context, optional
238/// \return datum of the computed sum as a Scalar
239///
240/// \since 1.0.0
241/// \note API not yet finalized
242ARROW_EXPORT
243Result<Datum> Sum(
244 const Datum& value,
245 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
246 ExecContext* ctx = NULLPTR);
247
248/// \brief Calculate the min / max of a numeric array
249///
250/// This function returns both the min and max as a struct scalar, with type
251/// struct<min: T, max: T>, where T is the input type
252///
253/// \param[in] value input datum, expecting Array or ChunkedArray
254/// \param[in] options see ScalarAggregateOptions for more information
255/// \param[in] ctx the function execution context, optional
256/// \return resulting datum as a struct<min: T, max: T> scalar
257///
258/// \since 1.0.0
259/// \note API not yet finalized
260ARROW_EXPORT
261Result<Datum> MinMax(
262 const Datum& value,
263 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
264 ExecContext* ctx = NULLPTR);
265
266/// \brief Test whether any element in a boolean array evaluates to true.
267///
268/// This function returns true if any of the elements in the array evaluates
269/// to true and false otherwise. Null values are ignored by default.
270/// If null values are taken into account by setting ScalarAggregateOptions
271/// parameter skip_nulls = false then Kleene logic is used.
272/// See KleeneOr for more details on Kleene logic.
273///
274/// \param[in] value input datum, expecting a boolean array
275/// \param[in] options see ScalarAggregateOptions for more information
276/// \param[in] ctx the function execution context, optional
277/// \return resulting datum as a BooleanScalar
278///
279/// \since 3.0.0
280/// \note API not yet finalized
281ARROW_EXPORT
282Result<Datum> Any(
283 const Datum& value,
284 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
285 ExecContext* ctx = NULLPTR);
286
287/// \brief Test whether all elements in a boolean array evaluate to true.
288///
289/// This function returns true if all of the elements in the array evaluate
290/// to true and false otherwise. Null values are ignored by default.
291/// If null values are taken into account by setting ScalarAggregateOptions
292/// parameter skip_nulls = false then Kleene logic is used.
293/// See KleeneAnd for more details on Kleene logic.
294///
295/// \param[in] value input datum, expecting a boolean array
296/// \param[in] options see ScalarAggregateOptions for more information
297/// \param[in] ctx the function execution context, optional
298/// \return resulting datum as a BooleanScalar
299
300/// \since 3.0.0
301/// \note API not yet finalized
302ARROW_EXPORT
303Result<Datum> All(
304 const Datum& value,
305 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
306 ExecContext* ctx = NULLPTR);
307
308/// \brief Calculate the modal (most common) value of a numeric array
309///
310/// This function returns top-n most common values and number of times they occur as
311/// an array of `struct<mode: T, count: int64>`, where T is the input type.
312/// Values with larger counts are returned before smaller ones.
313/// If there are more than one values with same count, smaller value is returned first.
314///
315/// \param[in] value input datum, expecting Array or ChunkedArray
316/// \param[in] options see ModeOptions for more information
317/// \param[in] ctx the function execution context, optional
318/// \return resulting datum as an array of struct<mode: T, count: int64>
319///
320/// \since 2.0.0
321/// \note API not yet finalized
322ARROW_EXPORT
323Result<Datum> Mode(const Datum& value,
324 const ModeOptions& options = ModeOptions::Defaults(),
325 ExecContext* ctx = NULLPTR);
326
327/// \brief Calculate the standard deviation of a numeric array
328///
329/// \param[in] value input datum, expecting Array or ChunkedArray
330/// \param[in] options see VarianceOptions for more information
331/// \param[in] ctx the function execution context, optional
332/// \return datum of the computed standard deviation as a DoubleScalar
333///
334/// \since 2.0.0
335/// \note API not yet finalized
336ARROW_EXPORT
337Result<Datum> Stddev(const Datum& value,
338 const VarianceOptions& options = VarianceOptions::Defaults(),
339 ExecContext* ctx = NULLPTR);
340
341/// \brief Calculate the variance of a numeric array
342///
343/// \param[in] value input datum, expecting Array or ChunkedArray
344/// \param[in] options see VarianceOptions for more information
345/// \param[in] ctx the function execution context, optional
346/// \return datum of the computed variance as a DoubleScalar
347///
348/// \since 2.0.0
349/// \note API not yet finalized
350ARROW_EXPORT
351Result<Datum> Variance(const Datum& value,
352 const VarianceOptions& options = VarianceOptions::Defaults(),
353 ExecContext* ctx = NULLPTR);
354
355/// \brief Calculate the quantiles of a numeric array
356///
357/// \param[in] value input datum, expecting Array or ChunkedArray
358/// \param[in] options see QuantileOptions for more information
359/// \param[in] ctx the function execution context, optional
360/// \return resulting datum as an array
361///
362/// \since 4.0.0
363/// \note API not yet finalized
364ARROW_EXPORT
365Result<Datum> Quantile(const Datum& value,
366 const QuantileOptions& options = QuantileOptions::Defaults(),
367 ExecContext* ctx = NULLPTR);
368
369/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
370///
371/// \param[in] value input datum, expecting Array or ChunkedArray
372/// \param[in] options see TDigestOptions for more information
373/// \param[in] ctx the function execution context, optional
374/// \return resulting datum as an array
375///
376/// \since 4.0.0
377/// \note API not yet finalized
378ARROW_EXPORT
379Result<Datum> TDigest(const Datum& value,
380 const TDigestOptions& options = TDigestOptions::Defaults(),
381 ExecContext* ctx = NULLPTR);
382
383/// \brief Find the first index of a value in an array.
384///
385/// \param[in] value The array to search.
386/// \param[in] options The array to search for. See IndexOoptions.
387/// \param[in] ctx the function execution context, optional
388/// \return out a Scalar containing the index (or -1 if not found).
389///
390/// \since 5.0.0
391/// \note API not yet finalized
392ARROW_EXPORT
393Result<Datum> Index(const Datum& value, const IndexOptions& options,
394 ExecContext* ctx = NULLPTR);
395
396namespace internal {
397
398/// Internal use only: streaming group identifier.
399/// Consumes batches of keys and yields batches of the group ids.
400class ARROW_EXPORT Grouper {
401 public:
402 virtual ~Grouper() = default;
403
404 /// Construct a Grouper which receives the specified key types
405 static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
406 ExecContext* ctx = default_exec_context());
407
408 /// Consume a batch of keys, producing the corresponding group ids as an integer array.
409 /// Currently only uint32 indices will be produced, eventually the bit width will only
410 /// be as wide as necessary.
411 virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
412
413 /// Get current unique keys. May be called multiple times.
414 virtual Result<ExecBatch> GetUniques() = 0;
415
416 /// Get the current number of groups.
417 virtual uint32_t num_groups() const = 0;
418
419 /// \brief Assemble lists of indices of identical elements.
420 ///
421 /// \param[in] ids An unsigned, all-valid integral array which will be
422 /// used as grouping criteria.
423 /// \param[in] num_groups An upper bound for the elements of ids
424 /// \return A num_groups-long ListArray where the slot at i contains a
425 /// list of indices where i appears in ids.
426 ///
427 /// MakeGroupings([
428 /// 2,
429 /// 2,
430 /// 5,
431 /// 5,
432 /// 2,
433 /// 3
434 /// ], 8) == [
435 /// [],
436 /// [],
437 /// [0, 1, 4],
438 /// [5],
439 /// [],
440 /// [2, 3],
441 /// [],
442 /// []
443 /// ]
444 static Result<std::shared_ptr<ListArray>> MakeGroupings(
445 const UInt32Array& ids, uint32_t num_groups,
446 ExecContext* ctx = default_exec_context());
447
448 /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
449 /// the provided groupings.
450 ///
451 /// For example,
452 /// ApplyGroupings([
453 /// [],
454 /// [],
455 /// [0, 1, 4],
456 /// [5],
457 /// [],
458 /// [2, 3],
459 /// [],
460 /// []
461 /// ], [2, 2, 5, 5, 2, 3]) == [
462 /// [],
463 /// [],
464 /// [2, 2, 2],
465 /// [3],
466 /// [],
467 /// [5, 5],
468 /// [],
469 /// []
470 /// ]
471 static Result<std::shared_ptr<ListArray>> ApplyGroupings(
472 const ListArray& groupings, const Array& array,
473 ExecContext* ctx = default_exec_context());
474};
475
476/// \brief Configure a grouped aggregation
477struct ARROW_EXPORT Aggregate {
478 /// the name of the aggregation function
479 std::string function;
480
481 /// options for the aggregation function
482 const FunctionOptions* options;
483};
484
485/// Internal use only: helper function for testing HashAggregateKernels.
486/// This will be replaced by streaming execution operators.
487ARROW_EXPORT
488Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
489 const std::vector<Aggregate>& aggregates, bool use_threads = false,
490 ExecContext* ctx = default_exec_context());
491
492} // namespace internal
493} // namespace compute
494} // namespace arrow