]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/table.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / table.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <cstdint>
21#include <memory>
22#include <string>
23#include <vector>
24
25#include "arrow/chunked_array.h" // IWYU pragma: keep
26#include "arrow/record_batch.h"
27#include "arrow/status.h"
28#include "arrow/type.h"
29#include "arrow/type_fwd.h"
30#include "arrow/util/macros.h"
31#include "arrow/util/visibility.h"
32
33namespace arrow {
34
// Forward declarations: within this header these types are only used through
// pointers or std::shared_ptr.
35class Array;
36class ChunkedArray;
37class KeyValueMetadata;
38class MemoryPool;
39
40/// \class Table
41/// \brief Logical table as sequence of chunked arrays
///
/// Table is an abstract interface: column access, slicing, column editing,
/// schema metadata replacement, flattening and validation are declared pure
/// virtual. The constructor is protected; the static factory functions below
/// return tables wrapped in std::shared_ptr<Table> (or a Result thereof).
42class ARROW_EXPORT Table {
43 public:
44 virtual ~Table() = default;
45
46 /// \brief Construct a Table from schema and columns
47 ///
48 /// If columns is zero-length, the table's number of rows is zero
49 ///
50 /// \param[in] schema The table schema (column types)
51 /// \param[in] columns The table's columns as chunked arrays
52 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
53 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
54 std::vector<std::shared_ptr<ChunkedArray>> columns,
55 int64_t num_rows = -1);
56
57 /// \brief Construct a Table from schema and arrays
58 ///
59 /// \param[in] schema The table schema (column types)
60 /// \param[in] arrays The table's columns as arrays
61 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
62 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
63 const std::vector<std::shared_ptr<Array>>& arrays,
64 int64_t num_rows = -1);
65
66 /// \brief Construct a Table from a RecordBatchReader.
67 ///
68 /// \param[in] reader the RecordBatchReader to build the table from
69 static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);
70
71 /// \brief Construct a Table from RecordBatches, using schema supplied by the first
72 /// RecordBatch.
73 ///
74 /// \param[in] batches a std::vector of record batches
75 static Result<std::shared_ptr<Table>> FromRecordBatches(
76 const std::vector<std::shared_ptr<RecordBatch>>& batches);
77
78 /// \brief Construct a Table from RecordBatches, using supplied schema. There may be
79 /// zero record batches
80 ///
81 /// \param[in] schema the arrow::Schema for each batch
82 /// \param[in] batches a std::vector of record batches
83 static Result<std::shared_ptr<Table>> FromRecordBatches(
84 std::shared_ptr<Schema> schema,
85 const std::vector<std::shared_ptr<RecordBatch>>& batches);
86
87 /// \brief Construct a Table from a chunked StructArray. One column will be produced
88 /// for each field of the StructArray.
89 ///
90 /// \param[in] array a chunked StructArray
91 static Result<std::shared_ptr<Table>> FromChunkedStructArray(
92 const std::shared_ptr<ChunkedArray>& array);
93
94 /// \brief Return the table schema
95 const std::shared_ptr<Schema>& schema() const { return schema_; }
96
97 /// \brief Return a column by index
98 virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
99
100 /// \brief Return vector of all columns for table
101 virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
102
103 /// \brief Return a column's field by index
104 std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
105
106 /// \brief Return vector of all fields for table
107 std::vector<std::shared_ptr<Field>> fields() const;
108
109 /// \brief Construct a zero-copy slice of the table with the
110 /// indicated offset and length
111 ///
112 /// \param[in] offset the index of the first row in the constructed
113 /// slice
114 /// \param[in] length the number of rows of the slice. If there are not enough
115 /// rows in the table, the length will be adjusted accordingly
116 ///
117 /// \return a new object wrapped in std::shared_ptr<Table>
118 virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;
119
120 /// \brief Slice from first row at offset until end of the table
 ///
 /// Forwards to Slice(offset, num_rows_); the two-argument overload is
 /// documented to shorten the length when fewer rows are available.
121 std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }
122
123 /// \brief Return a column by name
124 /// \param[in] name field name
125 /// \return the matching column as a ChunkedArray, or null if
 /// schema_->GetFieldIndex(name) returns -1
126 std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
127 auto i = schema_->GetFieldIndex(name);
128 return i == -1 ? NULLPTR : column(i);
129 }
130
131 /// \brief Remove column from the table, producing a new Table
132 virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;
133
134 /// \brief Add column to the table, producing a new Table
135 virtual Result<std::shared_ptr<Table>> AddColumn(
136 int i, std::shared_ptr<Field> field_arg,
137 std::shared_ptr<ChunkedArray> column) const = 0;
138
139 /// \brief Replace a column in the table, producing a new Table
140 virtual Result<std::shared_ptr<Table>> SetColumn(
141 int i, std::shared_ptr<Field> field_arg,
142 std::shared_ptr<ChunkedArray> column) const = 0;
143
144 /// \brief Return names of all columns
145 std::vector<std::string> ColumnNames() const;
146
147 /// \brief Rename columns with provided names
148 Result<std::shared_ptr<Table>> RenameColumns(
149 const std::vector<std::string>& names) const;
150
151 /// \brief Return new table with specified columns
152 Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
153
154 /// \brief Replace schema key-value metadata with new metadata
155 /// \since 0.5.0
156 ///
157 /// \param[in] metadata new KeyValueMetadata
158 /// \return new Table
159 virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
160 const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
161
162 /// \brief Flatten the table, producing a new Table. Any column with a
163 /// struct type will be flattened into multiple columns
164 ///
165 /// \param[in] pool The pool for buffer allocations, if any
166 virtual Result<std::shared_ptr<Table>> Flatten(
167 MemoryPool* pool = default_memory_pool()) const = 0;
168
169 /// \return PrettyPrint representation suitable for debugging
170 std::string ToString() const;
171
172 /// \brief Perform cheap validation checks to determine obvious inconsistencies
173 /// within the table's schema and internal data.
174 ///
175 /// This is O(k*m) where k is the total number of field descendants,
176 /// and m is the number of chunks.
177 ///
178 /// \return Status
179 virtual Status Validate() const = 0;
180
181 /// \brief Perform extensive validation checks to determine inconsistencies
182 /// within the table's schema and internal data.
183 ///
184 /// This is O(k*n) where k is the total number of field descendants,
185 /// and n is the number of rows.
186 ///
187 /// \return Status
188 virtual Status ValidateFull() const = 0;
189
190 /// \brief Return the number of columns in the table (the schema's field count)
191 int num_columns() const { return schema_->num_fields(); }
192
193 /// \brief Return the number of rows (equal to each column's logical length)
194 int64_t num_rows() const { return num_rows_; }
195
196 /// \brief Determine if tables are equal
197 ///
198 /// Two tables can be equal only if they have equal schemas.
199 /// However, they may be equal even if they have different chunkings.
200 bool Equals(const Table& other, bool check_metadata = false) const;
201
202 /// \brief Make a new table by combining the chunks this table has.
203 ///
204 /// All the underlying chunks in the ChunkedArray of each column are
205 /// concatenated into zero or one chunk.
206 ///
207 /// \param[in] pool The pool for buffer allocations
208 Result<std::shared_ptr<Table>> CombineChunks(
209 MemoryPool* pool = default_memory_pool()) const;
210
211 protected:
212 Table();
213
 /// Schema returned by schema(); also backs field(), num_columns() and
 /// GetColumnByName()
214 std::shared_ptr<Schema> schema_;
 /// Row count returned by num_rows(); used as the length in Slice(offset)
215 int64_t num_rows_;
216
217 private:
218 ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
219};
220
221/// \brief Compute a stream of record batches from a (possibly chunked) Table
222///
223/// The conversion is zero-copy: each record batch is a view over a slice
224/// of the table's columns.
///
/// NOTE(review): the reader has a `const Table&` member (table_), so it
/// presumably refers to the table passed to the constructor rather than
/// copying it; that table would then have to outlive the reader. Confirm
/// against the constructor's definition.
225class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
226 public:
227 /// \brief Construct a TableBatchReader for the given table
228 explicit TableBatchReader(const Table& table);
229
 // Overrides of the RecordBatchReader interface.
230 std::shared_ptr<Schema> schema() const override;
231
232 Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
233
234 /// \brief Set the desired maximum chunk size of record batches
235 ///
236 /// The actual chunk size of each record batch may be smaller, depending
237 /// on actual chunking characteristics of each table column.
 ///
 /// \param[in] chunksize the desired maximum chunk size
238 void set_chunksize(int64_t chunksize);
239
240 private:
 // NOTE(review): judging by the names, the vectors below hold per-column
 // read cursors and max_chunksize_ is the limit set by set_chunksize();
 // confirm against the implementation file.
241 const Table& table_;
242 std::vector<ChunkedArray*> column_data_;
243 std::vector<int> chunk_numbers_;
244 std::vector<int64_t> chunk_offsets_;
245 int64_t absolute_row_position_;
246 int64_t max_chunksize_;
247};
248
249/// \defgroup concat-tables ConcatenateTables function.
250///
251/// ConcatenateTables function.
252/// @{
253
254/// \brief Controls the behavior of ConcatenateTables().
255struct ARROW_EXPORT ConcatenateTablesOptions {
256 /// If true, the schemas of the tables will be first unified with fields of
257 /// the same name being merged, according to `field_merge_options`, then each
258 /// table will be promoted to the unified schema before being concatenated.
259 /// Otherwise, all tables should have the same schema. Each column in the output table
260 /// is the result of concatenating the corresponding columns in all input tables.
261 bool unify_schemas = false;
262
 /// Options applied when merging fields of the same name during schema
 /// unification (see `unify_schemas`). Initialized to
 /// Field::MergeOptions::Defaults().
263 Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
264
 /// \brief Return options with every member at its default initializer
 /// (unify_schemas = false, default field merge options)
265 static ConcatenateTablesOptions Defaults() { return {}; }
266};
267
268/// \brief Construct table from multiple input tables.
///
/// \param[in] tables the input tables
/// \param[in] options controls the concatenation, see ConcatenateTablesOptions
/// (defaults to ConcatenateTablesOptions::Defaults())
/// \param[in] memory_pool MemoryPool handed to the operation (defaults to
/// default_memory_pool())
/// \return the resulting Table, or an error
269ARROW_EXPORT
270Result<std::shared_ptr<Table>> ConcatenateTables(
271 const std::vector<std::shared_ptr<Table>>& tables,
272 ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
273 MemoryPool* memory_pool = default_memory_pool());
274
275/// \brief Promotes a table to conform to the given schema.
276///
277/// If a field in the schema does not have a corresponding column in the
278/// table, a column of nulls will be added to the resulting table.
279/// If the corresponding column is of type Null, it will be promoted to
280/// the type specified by schema, with null values filled.
281/// Returns an error:
282/// - if the corresponding column's type is not compatible with the
283/// schema.
284/// - if there is a column in the table that does not exist in the schema.
285///
286/// \param[in] table the input Table
287/// \param[in] schema the target schema to promote to
288/// \param[in] pool The memory pool to be used if null-filled arrays need to
289/// be created.
/// \return the promoted Table, or an error in the cases listed above
290ARROW_EXPORT
291Result<std::shared_ptr<Table>> PromoteTableToSchema(
292 const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
293 MemoryPool* pool = default_memory_pool());
294
295} // namespace arrow