[ceph.git] / ceph / src / arrow / cpp / src / arrow / compute / api_aggregate.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Eager evaluation convenience APIs for invoking common functions, including
// necessary memory allocations

#pragma once

#include "arrow/compute/function.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

class Array;

namespace compute {

class ExecContext;

// ----------------------------------------------------------------------
// Aggregate functions

/// \addtogroup compute-concrete-options
/// @{

/// \brief Control general scalar aggregate kernel behavior
///
/// By default, null values are ignored (skip_nulls = true) and null is emitted
/// if fewer than one non-null value is observed (min_count = 1).
///
/// This is the options type accepted by Mean, Product, Sum, MinMax, Any and All.
class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
 public:
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 1
  explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
  /// Name of this options type.
  constexpr static char const kTypeName[] = "ScalarAggregateOptions";
  /// Return options constructed with every argument left at its default value.
  static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }

  /// If true (the default), null values are ignored. Otherwise, if any value is null,
  /// emit null.
  bool skip_nulls;
  /// If fewer than this many non-null values are observed, emit null.
  uint32_t min_count;
};

/// \brief Control count aggregate kernel behavior.
///
/// By default, only non-null values are counted.
class ARROW_EXPORT CountOptions : public FunctionOptions {
 public:
  /// Selects which values are included in the count.
  enum CountMode {
    /// Count only non-null values.
    ONLY_VALID = 0,
    /// Count only null values.
    ONLY_NULL,
    /// Count both non-null and null values.
    ALL,
  };
  /// \param[in] mode which values to count, default ONLY_VALID
  explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
  /// Name of this options type.
  constexpr static char const kTypeName[] = "CountOptions";
  /// Return options constructed with the default mode (ONLY_VALID).
  static CountOptions Defaults() { return CountOptions{}; }

  /// Which values to count, see CountMode.
  CountMode mode;
};

/// \brief Control Mode kernel behavior
///
/// Returns top-n common values and counts.
/// By default, returns the most common value and count.
class ARROW_EXPORT ModeOptions : public FunctionOptions {
 public:
  /// \param[in] n see the n member, default 1
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
  /// Name of this options type.
  constexpr static char const kTypeName[] = "ModeOptions";
  /// Return options constructed with every argument left at its default value.
  static ModeOptions Defaults() { return ModeOptions{}; }

  /// Number of most common values to return (the "n" of top-n), 1 by default.
  int64_t n = 1;
  /// If true (the default), null values are ignored. Otherwise, if any value is null,
  /// emit null.
  bool skip_nulls;
  /// If fewer than this many non-null values are observed, emit null.
  uint32_t min_count;
};

/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
///
/// The divisor used in calculations is N - ddof, where N is the number of elements.
/// By default, ddof is zero, and population variance or stddev is returned.
class ARROW_EXPORT VarianceOptions : public FunctionOptions {
 public:
  /// \param[in] ddof see the ddof member, default 0
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
  /// Name of this options type.
  constexpr static char const kTypeName[] = "VarianceOptions";
  /// Return options constructed with every argument left at its default value.
  static VarianceOptions Defaults() { return VarianceOptions{}; }

  /// Delta degrees of freedom: the divisor used is N - ddof. Zero by default.
  int ddof = 0;
  /// If true (the default), null values are ignored. Otherwise, if any value is null,
  /// emit null.
  bool skip_nulls;
  /// If fewer than this many non-null values are observed, emit null.
  uint32_t min_count;
};

/// \brief Control Quantile kernel behavior
///
/// By default, returns the median value.
class ARROW_EXPORT QuantileOptions : public FunctionOptions {
 public:
  /// Interpolation method to use when quantile lies between two data points
  // NOTE(review): the individual modes are not described in this header; the
  // names match the interpolation options of numpy.quantile -- confirm the exact
  // semantics against the kernel implementation.
  enum Interpolation {
    LINEAR = 0,
    LOWER,
    HIGHER,
    NEAREST,
    MIDPOINT,
  };

  /// Construct options requesting a single quantile.
  ///
  /// \param[in] q the quantile to compute, default 0.5 (the median)
  /// \param[in] interpolation see the interpolation member, default LINEAR
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
                           bool skip_nulls = true, uint32_t min_count = 0);

  /// Construct options requesting several quantiles at once.
  ///
  /// \param[in] q the quantiles to compute
  /// \param[in] interpolation see the interpolation member, default LINEAR
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit QuantileOptions(std::vector<double> q,
                           enum Interpolation interpolation = LINEAR,
                           bool skip_nulls = true, uint32_t min_count = 0);

  /// Name of this options type.
  constexpr static char const kTypeName[] = "QuantileOptions";
  /// Return options constructed with every argument left at its default value.
  static QuantileOptions Defaults() { return QuantileOptions{}; }

  /// quantile must be between 0 and 1 inclusive
  std::vector<double> q;
  /// Interpolation method used when a quantile lies between two data points.
  enum Interpolation interpolation;
  /// If true (the default), null values are ignored. Otherwise, if any value is null,
  /// emit null.
  bool skip_nulls;
  /// If fewer than this many non-null values are observed, emit null.
  uint32_t min_count;
};

/// \brief Control TDigest approximate quantile kernel behavior
///
/// By default, returns the median value.
class ARROW_EXPORT TDigestOptions : public FunctionOptions {
 public:
  /// Construct options requesting a single approximate quantile.
  ///
  /// \param[in] q the quantile to compute, default 0.5 (the median)
  /// \param[in] delta see the delta member, default 100
  /// \param[in] buffer_size see the buffer_size member, default 500
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
                          uint32_t buffer_size = 500, bool skip_nulls = true,
                          uint32_t min_count = 0);
  /// Construct options requesting several approximate quantiles at once.
  ///
  /// \param[in] q the quantiles to compute
  /// \param[in] delta see the delta member, default 100
  /// \param[in] buffer_size see the buffer_size member, default 500
  /// \param[in] skip_nulls see the skip_nulls member, default true
  /// \param[in] min_count see the min_count member, default 0
  explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
                          uint32_t buffer_size = 500, bool skip_nulls = true,
                          uint32_t min_count = 0);
  /// Name of this options type.
  constexpr static char const kTypeName[] = "TDigestOptions";
  /// Return options constructed with every argument left at its default value.
  static TDigestOptions Defaults() { return TDigestOptions{}; }

  /// quantile must be between 0 and 1 inclusive
  std::vector<double> q;
  /// compression parameter, default 100
  uint32_t delta;
  /// input buffer size, default 500
  uint32_t buffer_size;
  /// If true (the default), null values are ignored. Otherwise, if any value is null,
  /// emit null.
  bool skip_nulls;
  /// If fewer than this many non-null values are observed, emit null.
  uint32_t min_count;
};

/// \brief Control Index kernel behavior
///
/// Carries the value that the Index function searches for.
class ARROW_EXPORT IndexOptions : public FunctionOptions {
 public:
  /// \param[in] value the value to search for
  explicit IndexOptions(std::shared_ptr<Scalar> value);
  // Default constructor for serialization
  IndexOptions();
  /// Name of this options type.
  constexpr static char const kTypeName[] = "IndexOptions";

  /// The value to search for.
  std::shared_ptr<Scalar> value;
};

/// @}

/// \brief Count values in an array.
///
/// \param[in] datum the datum whose values are counted
/// \param[in] options counting options, see CountOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Count(const Datum& datum,
                    const CountOptions& options = CountOptions::Defaults(),
                    ExecContext* ctx = NULLPTR);

/// \brief Compute the mean of a numeric array.
///
/// With the default options, null values are ignored; see ScalarAggregateOptions
/// for the skip_nulls and min_count settings.
///
/// \param[in] value datum to compute the mean, expecting Array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed mean as a DoubleScalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mean(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Compute the product of values of a numeric array.
///
/// With the default options, null values are ignored; see ScalarAggregateOptions
/// for the skip_nulls and min_count settings.
///
/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed product as a Scalar
///
/// \since 6.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Product(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Sum values of a numeric array.
///
/// With the default options, null values are ignored; see ScalarAggregateOptions
/// for the skip_nulls and min_count settings.
///
/// \param[in] value datum to sum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Sum(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Calculate the min / max of a numeric array.
///
/// This function returns both the min and max as a struct scalar, with type
/// struct<min: T, max: T>, where T is the input type.
///
/// With the default options, null values are ignored; see ScalarAggregateOptions
/// for the skip_nulls and min_count settings.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a struct<min: T, max: T> scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> MinMax(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Test whether any element in a boolean array evaluates to true.
///
/// This function returns true if any of the elements in the array evaluates
/// to true and false otherwise. Null values are ignored by default.
/// If skip_nulls is set to false in ScalarAggregateOptions, null values are
/// taken into account and Kleene logic is used.
/// See KleeneOr for more details on Kleene logic.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
///
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Any(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Test whether all elements in a boolean array evaluate to true.
///
/// This function returns true if all of the elements in the array evaluate
/// to true and false otherwise. Null values are ignored by default.
/// If skip_nulls is set to false in ScalarAggregateOptions, null values are
/// taken into account and Kleene logic is used.
/// See KleeneAnd for more details on Kleene logic.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
///
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> All(
    const Datum& value,
    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
    ExecContext* ctx = NULLPTR);

/// \brief Calculate the modal (most common) value of a numeric array
///
/// This function returns the top-n most common values and the number of times they
/// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
/// Values with larger counts are returned before smaller ones.
/// If several values have the same count, the smaller value is returned first.
/// With the default options (n = 1) only the single most common value is returned.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ModeOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array of struct<mode: T, count: int64>
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mode(const Datum& value,
                   const ModeOptions& options = ModeOptions::Defaults(),
                   ExecContext* ctx = NULLPTR);

/// \brief Calculate the standard deviation of a numeric array
///
/// The divisor used is N - ddof (see VarianceOptions). With the default options
/// ddof is zero, so the population standard deviation is returned.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed standard deviation as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Stddev(const Datum& value,
                     const VarianceOptions& options = VarianceOptions::Defaults(),
                     ExecContext* ctx = NULLPTR);

/// \brief Calculate the variance of a numeric array
///
/// The divisor used is N - ddof (see VarianceOptions). With the default options
/// ddof is zero, so the population variance is returned.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed variance as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Variance(const Datum& value,
                       const VarianceOptions& options = VarianceOptions::Defaults(),
                       ExecContext* ctx = NULLPTR);

/// \brief Calculate the quantiles of a numeric array
///
/// With the default options the median (q = 0.5) is requested, using LINEAR
/// interpolation (see QuantileOptions).
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see QuantileOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Quantile(const Datum& value,
                       const QuantileOptions& options = QuantileOptions::Defaults(),
                       ExecContext* ctx = NULLPTR);

/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
///
/// With the default options the approximate median (q = 0.5) is requested
/// (see TDigestOptions).
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see TDigestOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> TDigest(const Datum& value,
                      const TDigestOptions& options = TDigestOptions::Defaults(),
                      ExecContext* ctx = NULLPTR);

/// \brief Find the first index of a value in an array.
///
/// \param[in] value The array to search.
/// \param[in] options The value to search for. See IndexOptions.
/// \param[in] ctx the function execution context, optional
/// \return a Scalar containing the index (or -1 if not found).
///
/// \since 5.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Index(const Datum& value, const IndexOptions& options,
                    ExecContext* ctx = NULLPTR);

namespace internal {

/// Internal use only: streaming group identifier.
/// Consumes batches of keys and yields batches of the group ids.
///
/// This is an abstract interface; instances are obtained through Make().
class ARROW_EXPORT Grouper {
 public:
  virtual ~Grouper() = default;

  /// Construct a Grouper which receives the specified key types
  ///
  /// \param[in] descrs descriptors of the key columns the Grouper will receive
  /// \param[in] ctx the execution context, defaults to default_exec_context()
  /// \return the newly constructed Grouper, or an error
  static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
                                               ExecContext* ctx = default_exec_context());

  /// Consume a batch of keys, producing the corresponding group ids as an integer array.
  /// Currently only uint32 indices are produced; eventually the bit width will only
  /// be as wide as necessary.
  ///
  /// \param[in] batch the batch of keys to consume
  /// \return the group ids corresponding to the keys in batch
  virtual Result<Datum> Consume(const ExecBatch& batch) = 0;

  /// Get current unique keys. May be called multiple times.
  virtual Result<ExecBatch> GetUniques() = 0;

  /// Get the current number of groups.
  virtual uint32_t num_groups() const = 0;

  /// \brief Assemble lists of indices of identical elements.
  ///
  /// \param[in] ids An unsigned, all-valid integral array which will be
  ///                used as grouping criteria.
  /// \param[in] num_groups An upper bound for the elements of ids
  /// \param[in] ctx the execution context, defaults to default_exec_context()
  /// \return A num_groups-long ListArray where the slot at i contains a
  ///         list of indices where i appears in ids.
  ///
  ///   MakeGroupings([
  ///       2,
  ///       2,
  ///       5,
  ///       5,
  ///       2,
  ///       3
  ///   ], 8) == [
  ///       [],
  ///       [],
  ///       [0, 1, 4],
  ///       [5],
  ///       [],
  ///       [2, 3],
  ///       [],
  ///       []
  ///   ]
  static Result<std::shared_ptr<ListArray>> MakeGroupings(
      const UInt32Array& ids, uint32_t num_groups,
      ExecContext* ctx = default_exec_context());

  /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
  /// the provided groupings.
  ///
  /// \param[in] groupings lists of indices into `array`, e.g. as produced by
  ///                      MakeGroupings
  /// \param[in] array the array from which elements are selected
  /// \param[in] ctx the execution context, defaults to default_exec_context()
  /// \return A ListArray whose slot at i holds the elements of `array` selected by
  ///         the indices in slot i of `groupings`.
  ///
  /// For example,
  ///   ApplyGroupings([
  ///       [],
  ///       [],
  ///       [0, 1, 4],
  ///       [5],
  ///       [],
  ///       [2, 3],
  ///       [],
  ///       []
  ///   ], [2, 2, 5, 5, 2, 3]) == [
  ///       [],
  ///       [],
  ///       [2, 2, 2],
  ///       [3],
  ///       [],
  ///       [5, 5],
  ///       [],
  ///       []
  ///   ]
  static Result<std::shared_ptr<ListArray>> ApplyGroupings(
      const ListArray& groupings, const Array& array,
      ExecContext* ctx = default_exec_context());
};

/// \brief Configure a grouped aggregation
///
/// One entry of the `aggregates` list passed to GroupBy.
struct ARROW_EXPORT Aggregate {
  /// the name of the aggregation function
  std::string function;

  /// options for the aggregation function
  // NOTE(review): raw pointer with no initializer; presumably non-owning, so the
  // pointed-to options would have to outlive any use of this struct -- confirm
  // against callers. Whether a null pointer is accepted is not visible here.
  const FunctionOptions* options;
};

/// Internal use only: helper function for testing HashAggregateKernels.
/// This will be replaced by streaming execution operators.
///
/// \param[in] arguments presumably the input values to aggregate, one per entry
///                      of `aggregates` -- TODO confirm against the implementation
/// \param[in] keys presumably the columns whose values define the groups --
///                 TODO confirm against the implementation
/// \param[in] aggregates the aggregations to compute, see Aggregate
/// \param[in] use_threads threading switch, default false; its exact effect is
///                        not visible in this header
/// \param[in] ctx the execution context, defaults to default_exec_context()
/// \return the resulting datum, or an error
ARROW_EXPORT
Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
                      const std::vector<Aggregate>& aggregates, bool use_threads = false,
                      ExecContext* ctx = default_exec_context());

}  // namespace internal
}  // namespace compute
}  // namespace arrow
Commit	Line	Data
1d09f67e TL	1	// Licensed to the Apache Software Foundation (ASF) under one
	2	// or more contributor license agreements. See the NOTICE file
	3	// distributed with this work for additional information
	4	// regarding copyright ownership. The ASF licenses this file
	5	// to you under the Apache License, Version 2.0 (the
	6	// "License"); you may not use this file except in compliance
	7	// with the License. You may obtain a copy of the License at
	8	//
	9	// http://www.apache.org/licenses/LICENSE-2.0
	10	//
	11	// Unless required by applicable law or agreed to in writing,
	12	// software distributed under the License is distributed on an
	13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	14	// KIND, either express or implied. See the License for the
	15	// specific language governing permissions and limitations
	16	// under the License.
	17
	18	// Eager evaluation convenience APIs for invoking common functions, including
	19	// necessary memory allocations
	20
	21	#pragma once
	22
	23	#include "arrow/compute/function.h"
	24	#include "arrow/datum.h"
	25	#include "arrow/result.h"
	26	#include "arrow/util/macros.h"
	27	#include "arrow/util/visibility.h"
	28
	29	namespace arrow {
	30
	31	class Array;
	32
	33	namespace compute {
	34
	35	class ExecContext;
	36
	37	// ----------------------------------------------------------------------
	38	// Aggregate functions
	39
	40	/// \addtogroup compute-concrete-options
	41	/// @{
	42
	43	/// \brief Control general scalar aggregate kernel behavior
	44	///
	45	/// By default, null values are ignored (skip_nulls = true).
	46	class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
	47	public:
	48	explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
	49	constexpr static char const kTypeName[] = "ScalarAggregateOptions";
	50	static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
	51
	52	/// If true (the default), null values are ignored. Otherwise, if any value is null,
	53	/// emit null.
	54	bool skip_nulls;
	55	/// If less than this many non-null values are observed, emit null.
	56	uint32_t min_count;
	57	};
	58
	59	/// \brief Control count aggregate kernel behavior.
	60	///
	61	/// By default, only non-null values are counted.
	62	class ARROW_EXPORT CountOptions : public FunctionOptions {
	63	public:
	64	enum CountMode {
65	/// Count only non-null values.
66	ONLY_VALID = 0,
67	/// Count only null values.
68	ONLY_NULL,
69	/// Count both non-null and null values.
70	ALL,
71	};
72	explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
73	constexpr static char const kTypeName[] = "CountOptions";
74	static CountOptions Defaults() { return CountOptions{}; }
75
76	CountMode mode;
77	};
78
79	/// \brief Control Mode kernel behavior
80	///
81	/// Returns top-n common values and counts.
82	/// By default, returns the most common value and count.
83	class ARROW_EXPORT ModeOptions : public FunctionOptions {
84	public:
85	explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
86	constexpr static char const kTypeName[] = "ModeOptions";
87	static ModeOptions Defaults() { return ModeOptions{}; }
88
89	int64_t n = 1;
90	/// If true (the default), null values are ignored. Otherwise, if any value is null,
91	/// emit null.
92	bool skip_nulls;
93	/// If less than this many non-null values are observed, emit null.
94	uint32_t min_count;
95	};
96
97	/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
98	///
99	/// The divisor used in calculations is N - ddof, where N is the number of elements.
100	/// By default, ddof is zero, and population variance or stddev is returned.
101	class ARROW_EXPORT VarianceOptions : public FunctionOptions {
102	public:
103	explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
104	constexpr static char const kTypeName[] = "VarianceOptions";
105	static VarianceOptions Defaults() { return VarianceOptions{}; }
106
107	int ddof = 0;
108	/// If true (the default), null values are ignored. Otherwise, if any value is null,
109	/// emit null.
110	bool skip_nulls;
111	/// If less than this many non-null values are observed, emit null.
112	uint32_t min_count;
113	};
114
115	/// \brief Control Quantile kernel behavior
116	///
117	/// By default, returns the median value.
118	class ARROW_EXPORT QuantileOptions : public FunctionOptions {
119	public:
120	/// Interpolation method to use when quantile lies between two data points
121	enum Interpolation {
122	LINEAR = 0,
123	LOWER,
124	HIGHER,
125	NEAREST,
126	MIDPOINT,
127	};
128
129	explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
130	bool skip_nulls = true, uint32_t min_count = 0);
131
132	explicit QuantileOptions(std::vector<double> q,
133	enum Interpolation interpolation = LINEAR,
134	bool skip_nulls = true, uint32_t min_count = 0);
135
136	constexpr static char const kTypeName[] = "QuantileOptions";
137	static QuantileOptions Defaults() { return QuantileOptions{}; }
138
139	/// quantile must be between 0 and 1 inclusive
140	std::vector<double> q;
141	enum Interpolation interpolation;
142	/// If true (the default), null values are ignored. Otherwise, if any value is null,
143	/// emit null.
144	bool skip_nulls;
145	/// If less than this many non-null values are observed, emit null.
146	uint32_t min_count;
147	};
148
149	/// \brief Control TDigest approximate quantile kernel behavior
150	///
151	/// By default, returns the median value.
152	class ARROW_EXPORT TDigestOptions : public FunctionOptions {
153	public:
154	explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
155	uint32_t buffer_size = 500, bool skip_nulls = true,
156	uint32_t min_count = 0);
157	explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
158	uint32_t buffer_size = 500, bool skip_nulls = true,
159	uint32_t min_count = 0);
160	constexpr static char const kTypeName[] = "TDigestOptions";
161	static TDigestOptions Defaults() { return TDigestOptions{}; }
162
163	/// quantile must be between 0 and 1 inclusive
164	std::vector<double> q;
165	/// compression parameter, default 100
166	uint32_t delta;
167	/// input buffer size, default 500
168	uint32_t buffer_size;
169	/// If true (the default), null values are ignored. Otherwise, if any value is null,
170	/// emit null.
171	bool skip_nulls;
172	/// If less than this many non-null values are observed, emit null.
173	uint32_t min_count;
174	};
175
176	/// \brief Control Index kernel behavior
177	class ARROW_EXPORT IndexOptions : public FunctionOptions {
178	public:
179	explicit IndexOptions(std::shared_ptr<Scalar> value);
180	// Default constructor for serialization
181	IndexOptions();
182	constexpr static char const kTypeName[] = "IndexOptions";
183
184	std::shared_ptr<Scalar> value;
185	};
186
187	/// @}
188
189	/// \brief Count values in an array.
190	///
191	/// \param[in] options counting options, see CountOptions for more information
192	/// \param[in] datum to count
193	/// \param[in] ctx the function execution context, optional
194	/// \return out resulting datum
195	///
196	/// \since 1.0.0
197	/// \note API not yet finalized
198	ARROW_EXPORT
199	Result<Datum> Count(const Datum& datum,
200	const CountOptions& options = CountOptions::Defaults(),
201	ExecContext* ctx = NULLPTR);
202
203	/// \brief Compute the mean of a numeric array.
204	///
205	/// \param[in] value datum to compute the mean, expecting Array
206	/// \param[in] options see ScalarAggregateOptions for more information
207	/// \param[in] ctx the function execution context, optional
208	/// \return datum of the computed mean as a DoubleScalar
209	///
210	/// \since 1.0.0
211	/// \note API not yet finalized
212	ARROW_EXPORT
213	Result<Datum> Mean(
214	const Datum& value,
215	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
216	ExecContext* ctx = NULLPTR);
217
218	/// \brief Compute the product of values of a numeric array.
219	///
220	/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
221	/// \param[in] options see ScalarAggregateOptions for more information
222	/// \param[in] ctx the function execution context, optional
223	/// \return datum of the computed sum as a Scalar
224	///
225	/// \since 6.0.0
226	/// \note API not yet finalized
227	ARROW_EXPORT
228	Result<Datum> Product(
229	const Datum& value,
230	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
231	ExecContext* ctx = NULLPTR);
232
233	/// \brief Sum values of a numeric array.
234	///
235	/// \param[in] value datum to sum, expecting Array or ChunkedArray
236	/// \param[in] options see ScalarAggregateOptions for more information
237	/// \param[in] ctx the function execution context, optional
238	/// \return datum of the computed sum as a Scalar
239	///
240	/// \since 1.0.0
241	/// \note API not yet finalized
242	ARROW_EXPORT
243	Result<Datum> Sum(
244	const Datum& value,
245	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
246	ExecContext* ctx = NULLPTR);
247
248	/// \brief Calculate the min / max of a numeric array
249	///
250	/// This function returns both the min and max as a struct scalar, with type
251	/// struct<min: T, max: T>, where T is the input type
252	///
253	/// \param[in] value input datum, expecting Array or ChunkedArray
254	/// \param[in] options see ScalarAggregateOptions for more information
255	/// \param[in] ctx the function execution context, optional
256	/// \return resulting datum as a struct<min: T, max: T> scalar
257	///
258	/// \since 1.0.0
259	/// \note API not yet finalized
260	ARROW_EXPORT
261	Result<Datum> MinMax(
262	const Datum& value,
263	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
264	ExecContext* ctx = NULLPTR);
265
266	/// \brief Test whether any element in a boolean array evaluates to true.
267	///
268	/// This function returns true if any of the elements in the array evaluates
269	/// to true and false otherwise. Null values are ignored by default.
270	/// If null values are taken into account by setting ScalarAggregateOptions
271	/// parameter skip_nulls = false then Kleene logic is used.
272	/// See KleeneOr for more details on Kleene logic.
273	///
274	/// \param[in] value input datum, expecting a boolean array
275	/// \param[in] options see ScalarAggregateOptions for more information
276	/// \param[in] ctx the function execution context, optional
277	/// \return resulting datum as a BooleanScalar
278	///
279	/// \since 3.0.0
280	/// \note API not yet finalized
281	ARROW_EXPORT
282	Result<Datum> Any(
283	const Datum& value,
284	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
285	ExecContext* ctx = NULLPTR);
286
287	/// \brief Test whether all elements in a boolean array evaluate to true.
288	///
289	/// This function returns true if all of the elements in the array evaluate
290	/// to true and false otherwise. Null values are ignored by default.
291	/// If null values are taken into account by setting ScalarAggregateOptions
292	/// parameter skip_nulls = false then Kleene logic is used.
293	/// See KleeneAnd for more details on Kleene logic.
294	///
295	/// \param[in] value input datum, expecting a boolean array
296	/// \param[in] options see ScalarAggregateOptions for more information
297	/// \param[in] ctx the function execution context, optional
298	/// \return resulting datum as a BooleanScalar
299
300	/// \since 3.0.0
301	/// \note API not yet finalized
302	ARROW_EXPORT
303	Result<Datum> All(
304	const Datum& value,
305	const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
306	ExecContext* ctx = NULLPTR);
307
308	/// \brief Calculate the modal (most common) value of a numeric array
309	///
310	/// This function returns top-n most common values and number of times they occur as
311	/// an array of `struct<mode: T, count: int64>`, where T is the input type.
312	/// Values with larger counts are returned before smaller ones.
313	/// If there are more than one values with same count, smaller value is returned first.
314	///
315	/// \param[in] value input datum, expecting Array or ChunkedArray
316	/// \param[in] options see ModeOptions for more information
317	/// \param[in] ctx the function execution context, optional
318	/// \return resulting datum as an array of struct<mode: T, count: int64>
319	///
320	/// \since 2.0.0
321	/// \note API not yet finalized
322	ARROW_EXPORT
323	Result<Datum> Mode(const Datum& value,
324	const ModeOptions& options = ModeOptions::Defaults(),
325	ExecContext* ctx = NULLPTR);
326
327	/// \brief Calculate the standard deviation of a numeric array
328	///
329	/// \param[in] value input datum, expecting Array or ChunkedArray
330	/// \param[in] options see VarianceOptions for more information
331	/// \param[in] ctx the function execution context, optional
332	/// \return datum of the computed standard deviation as a DoubleScalar
333	///
334	/// \since 2.0.0
335	/// \note API not yet finalized
336	ARROW_EXPORT
337	Result<Datum> Stddev(const Datum& value,
338	const VarianceOptions& options = VarianceOptions::Defaults(),
339	ExecContext* ctx = NULLPTR);
340
341	/// \brief Calculate the variance of a numeric array
342	///
343	/// \param[in] value input datum, expecting Array or ChunkedArray
344	/// \param[in] options see VarianceOptions for more information
345	/// \param[in] ctx the function execution context, optional
346	/// \return datum of the computed variance as a DoubleScalar
347	///
348	/// \since 2.0.0
349	/// \note API not yet finalized
350	ARROW_EXPORT
351	Result<Datum> Variance(const Datum& value,
352	const VarianceOptions& options = VarianceOptions::Defaults(),
353	ExecContext* ctx = NULLPTR);
354
355	/// \brief Calculate the quantiles of a numeric array
356	///
357	/// \param[in] value input datum, expecting Array or ChunkedArray
358	/// \param[in] options see QuantileOptions for more information
359	/// \param[in] ctx the function execution context, optional
360	/// \return resulting datum as an array
361	///
362	/// \since 4.0.0
363	/// \note API not yet finalized
364	ARROW_EXPORT
365	Result<Datum> Quantile(const Datum& value,
366	const QuantileOptions& options = QuantileOptions::Defaults(),
367	ExecContext* ctx = NULLPTR);
368
369	/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
370	///
371	/// \param[in] value input datum, expecting Array or ChunkedArray
372	/// \param[in] options see TDigestOptions for more information
373	/// \param[in] ctx the function execution context, optional
374	/// \return resulting datum as an array
375	///
376	/// \since 4.0.0
377	/// \note API not yet finalized
378	ARROW_EXPORT
379	Result<Datum> TDigest(const Datum& value,
380	const TDigestOptions& options = TDigestOptions::Defaults(),
381	ExecContext* ctx = NULLPTR);
382
383	/// \brief Find the first index of a value in an array.
384	///
385	/// \param[in] value The array to search.
386	/// \param[in] options The array to search for. See IndexOoptions.
387	/// \param[in] ctx the function execution context, optional
388	/// \return out a Scalar containing the index (or -1 if not found).
389	///
390	/// \since 5.0.0
391	/// \note API not yet finalized
392	ARROW_EXPORT
393	Result<Datum> Index(const Datum& value, const IndexOptions& options,
394	ExecContext* ctx = NULLPTR);
395
396	namespace internal {
397
398	/// Internal use only: streaming group identifier.
399	/// Consumes batches of keys and yields batches of the group ids.
400	class ARROW_EXPORT Grouper {
401	public:
402	virtual ~Grouper() = default;
403
404	/// Construct a Grouper which receives the specified key types
405	static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
406	ExecContext* ctx = default_exec_context());
407
408	/// Consume a batch of keys, producing the corresponding group ids as an integer array.
409	/// Currently only uint32 indices will be produced, eventually the bit width will only
410	/// be as wide as necessary.
411	virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
412
413	/// Get current unique keys. May be called multiple times.
414	virtual Result<ExecBatch> GetUniques() = 0;
415
416	/// Get the current number of groups.
417	virtual uint32_t num_groups() const = 0;
418
419	/// \brief Assemble lists of indices of identical elements.
420	///
421	/// \param[in] ids An unsigned, all-valid integral array which will be
422	/// used as grouping criteria.
423	/// \param[in] num_groups An upper bound for the elements of ids
424	/// \return A num_groups-long ListArray where the slot at i contains a
425	/// list of indices where i appears in ids.
426	///
427	/// MakeGroupings([
428	/// 2,
429	/// 2,
430	/// 5,
431	/// 5,
432	/// 2,
433	/// 3
434	/// ], 8) == [
435	/// [],
436	/// [],
437	/// [0, 1, 4],
438	/// [5],
439	/// [],
440	/// [2, 3],
441	/// [],
442	/// []
443	/// ]
444	static Result<std::shared_ptr<ListArray>> MakeGroupings(
445	const UInt32Array& ids, uint32_t num_groups,
446	ExecContext* ctx = default_exec_context());
447
448	/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
449	/// the provided groupings.
450	///
451	/// For example,
452	/// ApplyGroupings([
453	/// [],
454	/// [],
455	/// [0, 1, 4],
456	/// [5],
457	/// [],
458	/// [2, 3],
459	/// [],
460	/// []
461	/// ], [2, 2, 5, 5, 2, 3]) == [
462	/// [],
463	/// [],
464	/// [2, 2, 2],
465	/// [3],
466	/// [],
467	/// [5, 5],
468	/// [],
469	/// []
470	/// ]
471	static Result<std::shared_ptr<ListArray>> ApplyGroupings(
472	const ListArray& groupings, const Array& array,
473	ExecContext* ctx = default_exec_context());
474	};
475
476	/// \brief Configure a grouped aggregation
477	struct ARROW_EXPORT Aggregate {
478	/// the name of the aggregation function
479	std::string function;
480
481	/// options for the aggregation function
482	const FunctionOptions* options;
483	};
484
485	/// Internal use only: helper function for testing HashAggregateKernels.
486	/// This will be replaced by streaming execution operators.
487	ARROW_EXPORT
488	Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
489	const std::vector<Aggregate>& aggregates, bool use_threads = false,
490	ExecContext* ctx = default_exec_context());
491
492	} // namespace internal
493	} // namespace compute
494	} // namespace arrow