1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <cstdint> | |
21 | #include <memory> | |
22 | #include <string> | |
23 | #include <vector> | |
24 | ||
25 | #include "arrow/chunked_array.h" // IWYU pragma: keep | |
26 | #include "arrow/record_batch.h" | |
27 | #include "arrow/status.h" | |
28 | #include "arrow/type.h" | |
29 | #include "arrow/type_fwd.h" | |
30 | #include "arrow/util/macros.h" | |
31 | #include "arrow/util/visibility.h" | |
32 | ||
33 | namespace arrow { | |
34 | ||
35 | class Array; | |
36 | class ChunkedArray; | |
37 | class KeyValueMetadata; | |
38 | class MemoryPool; | |
39 | ||
/// \class Table
/// \brief Logical table as sequence of chunked arrays
class ARROW_EXPORT Table {
 public:
  virtual ~Table() = default;

  /// \brief Construct a Table from schema and columns
  ///
  /// If columns is zero-length, the table's number of rows is zero
  ///
  /// \param[in] schema The table schema (column types)
  /// \param[in] columns The table's columns as chunked arrays
  /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
  static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
                                     std::vector<std::shared_ptr<ChunkedArray>> columns,
                                     int64_t num_rows = -1);

  /// \brief Construct a Table from schema and arrays
  ///
  /// \param[in] schema The table schema (column types)
  /// \param[in] arrays The table's columns as arrays
  /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
  static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
                                     const std::vector<std::shared_ptr<Array>>& arrays,
                                     int64_t num_rows = -1);

  /// \brief Construct a Table from a RecordBatchReader.
  ///
  /// \param[in] reader the RecordBatchReader from which the table's record
  /// batches are read
  static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);

  /// \brief Construct a Table from RecordBatches, using schema supplied by the first
  /// RecordBatch.
  ///
  /// \param[in] batches a std::vector of record batches
  static Result<std::shared_ptr<Table>> FromRecordBatches(
      const std::vector<std::shared_ptr<RecordBatch>>& batches);

  /// \brief Construct a Table from RecordBatches, using supplied schema. There may be
  /// zero record batches
  ///
  /// \param[in] schema the arrow::Schema for each batch
  /// \param[in] batches a std::vector of record batches
  static Result<std::shared_ptr<Table>> FromRecordBatches(
      std::shared_ptr<Schema> schema,
      const std::vector<std::shared_ptr<RecordBatch>>& batches);

  /// \brief Construct a Table from a chunked StructArray. One column will be produced
  /// for each field of the StructArray.
  ///
  /// \param[in] array a chunked StructArray
  static Result<std::shared_ptr<Table>> FromChunkedStructArray(
      const std::shared_ptr<ChunkedArray>& array);

  /// \brief Return the table schema
  const std::shared_ptr<Schema>& schema() const { return schema_; }

  /// \brief Return a column by index
  virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;

  /// \brief Return vector of all columns for table
  virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;

  /// \brief Return a column's field (name and metadata) by index
  std::shared_ptr<Field> field(int i) const { return schema_->field(i); }

  /// \brief Return vector of all fields for table
  std::vector<std::shared_ptr<Field>> fields() const;

  /// \brief Construct a zero-copy slice of the table with the
  /// indicated offset and length
  ///
  /// \param[in] offset the index of the first row in the constructed
  /// slice
  /// \param[in] length the number of rows of the slice. If there are not enough
  /// rows in the table, the length will be adjusted accordingly
  ///
  /// \return a new object wrapped in std::shared_ptr<Table>
  virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;

  /// \brief Slice from first row at offset until end of the table
  ///
  /// Passes num_rows_ as the length; per the two-argument overload's contract,
  /// the length is adjusted to the rows actually available after offset.
  std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }

  /// \brief Return a column by name
  /// \param[in] name field name
  /// \return a ChunkedArray, or null if no field with the given name was found
  std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
    // GetFieldIndex returns -1 when the name is absent; with duplicate field
    // names, which index is returned is up to Schema::GetFieldIndex.
    auto i = schema_->GetFieldIndex(name);
    return i == -1 ? NULLPTR : column(i);
  }

  /// \brief Remove column from the table, producing a new Table
  virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;

  /// \brief Add column to the table, producing a new Table
  virtual Result<std::shared_ptr<Table>> AddColumn(
      int i, std::shared_ptr<Field> field_arg,
      std::shared_ptr<ChunkedArray> column) const = 0;

  /// \brief Replace a column in the table, producing a new Table
  virtual Result<std::shared_ptr<Table>> SetColumn(
      int i, std::shared_ptr<Field> field_arg,
      std::shared_ptr<ChunkedArray> column) const = 0;

  /// \brief Return names of all columns
  std::vector<std::string> ColumnNames() const;

  /// \brief Rename columns with provided names
  Result<std::shared_ptr<Table>> RenameColumns(
      const std::vector<std::string>& names) const;

  /// \brief Return new table with specified columns
  Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;

  /// \brief Replace schema key-value metadata with new metadata
  /// \since 0.5.0
  ///
  /// \param[in] metadata new KeyValueMetadata
  /// \return new Table
  virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
      const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;

  /// \brief Flatten the table, producing a new Table. Any column with a
  /// struct type will be flattened into multiple columns
  ///
  /// \param[in] pool The pool for buffer allocations, if any
  virtual Result<std::shared_ptr<Table>> Flatten(
      MemoryPool* pool = default_memory_pool()) const = 0;

  /// \return PrettyPrint representation suitable for debugging
  std::string ToString() const;

  /// \brief Perform cheap validation checks to determine obvious inconsistencies
  /// within the table's schema and internal data.
  ///
  /// This is O(k*m) where k is the total number of field descendants,
  /// and m is the number of chunks.
  ///
  /// \return Status
  virtual Status Validate() const = 0;

  /// \brief Perform extensive validation checks to determine inconsistencies
  /// within the table's schema and internal data.
  ///
  /// This is O(k*n) where k is the total number of field descendants,
  /// and n is the number of rows.
  ///
  /// \return Status
  virtual Status ValidateFull() const = 0;

  /// \brief Return the number of columns in the table
  int num_columns() const { return schema_->num_fields(); }

  /// \brief Return the number of rows (equal to each column's logical length)
  int64_t num_rows() const { return num_rows_; }

  /// \brief Determine if tables are equal
  ///
  /// Two tables can be equal only if they have equal schemas.
  /// However, they may be equal even if they have different chunkings.
  bool Equals(const Table& other, bool check_metadata = false) const;

  /// \brief Make a new table by combining the chunks this table has.
  ///
  /// All the underlying chunks in the ChunkedArray of each column are
  /// concatenated into zero or one chunk.
  ///
  /// \param[in] pool The pool for buffer allocations
  Result<std::shared_ptr<Table>> CombineChunks(
      MemoryPool* pool = default_memory_pool()) const;

 protected:
  Table();

  // Column names and types, shared by all concrete Table implementations.
  std::shared_ptr<Schema> schema_;
  // Logical number of rows (each column's logical length).
  int64_t num_rows_;

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
};
220 | ||
/// \brief Compute a stream of record batches from a (possibly chunked) Table
///
/// The conversion is zero-copy: each record batch is a view over a slice
/// of the table's columns.
class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
 public:
  /// \brief Construct a TableBatchReader for the given table
  ///
  /// The reader holds a reference to the table, so the table must remain
  /// alive for the lifetime of the reader.
  explicit TableBatchReader(const Table& table);

  /// \brief Return the schema of the emitted record batches
  std::shared_ptr<Schema> schema() const override;

  /// \brief Read the next record batch into *out
  ///
  /// See RecordBatchReader::ReadNext for the end-of-stream convention.
  Status ReadNext(std::shared_ptr<RecordBatch>* out) override;

  /// \brief Set the desired maximum chunk size of record batches
  ///
  /// The actual chunk size of each record batch may be smaller, depending
  /// on actual chunking characteristics of each table column.
  void set_chunksize(int64_t chunksize);

 private:
  // Borrowed reference — must outlive this reader.
  const Table& table_;
  std::vector<ChunkedArray*> column_data_;
  // Per-column read cursors (presumably: current chunk index and offset
  // within that chunk — confirm against the implementation).
  std::vector<int> chunk_numbers_;
  std::vector<int64_t> chunk_offsets_;
  int64_t absolute_row_position_;
  int64_t max_chunksize_;
};
248 | ||
249 | /// \defgroup concat-tables ConcatenateTables function. | |
250 | /// | |
251 | /// ConcatenateTables function. | |
252 | /// @{ | |
253 | ||
/// \brief Controls the behavior of ConcatenateTables().
struct ARROW_EXPORT ConcatenateTablesOptions {
  /// If true, the schemas of the tables will be first unified with fields of
  /// the same name being merged, according to `field_merge_options`, then each
  /// table will be promoted to the unified schema before being concatenated.
  /// Otherwise, all tables should have the same schema. Each column in the output table
  /// is the result of concatenating the corresponding columns in all input tables.
  bool unify_schemas = false;

  /// Options used to merge same-named fields when `unify_schemas` is true.
  Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();

  /// \brief Return the default options (no schema unification).
  static ConcatenateTablesOptions Defaults() { return {}; }
};
267 | ||
/// \brief Construct table from multiple input tables.
///
/// \param[in] tables the tables to concatenate
/// \param[in] options options controlling the concatenation, notably whether
/// input schemas are first unified (see ConcatenateTablesOptions)
/// \param[in] memory_pool the pool for any buffer allocations required
ARROW_EXPORT
Result<std::shared_ptr<Table>> ConcatenateTables(
    const std::vector<std::shared_ptr<Table>>& tables,
    ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
    MemoryPool* memory_pool = default_memory_pool());

/// @}
274 | ||
/// \brief Promotes a table to conform to the given schema.
///
/// If a field in the schema does not have a corresponding column in the
/// table, a column of nulls will be added to the resulting table.
/// If the corresponding column is of type Null, it will be promoted to
/// the type specified by schema, with null values filled.
/// Returns an error:
/// - if the corresponding column's type is not compatible with the
///   schema.
/// - if there is a column in the table that does not exist in the schema.
///
/// \param[in] table the input Table
/// \param[in] schema the target schema to promote to
/// \param[in] pool The memory pool to be used if null-filled arrays need to
/// be created.
ARROW_EXPORT
Result<std::shared_ptr<Table>> PromoteTableToSchema(
    const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
    MemoryPool* pool = default_memory_pool());
294 | ||
295 | } // namespace arrow |