// Vendored copy of ceph/src/arrow/cpp/src/parquet/arrow/reader.h
// (Apache Arrow, as shipped with Ceph "quincy" 17.2.0).
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #pragma once
19
20 #include <cstdint>
21 // N.B. we don't include async_generator.h as it's relatively heavy
22 #include <functional>
23 #include <memory>
24 #include <vector>
25
26 #include "parquet/file_reader.h"
27 #include "parquet/platform.h"
28 #include "parquet/properties.h"
29
30 namespace arrow {
31
32 class ChunkedArray;
33 class KeyValueMetadata;
34 class RecordBatchReader;
35 struct Scalar;
36 class Schema;
37 class Table;
38 class RecordBatch;
39
40 } // namespace arrow
41
42 namespace parquet {
43
44 class FileMetaData;
45 class SchemaDescriptor;
46
47 namespace arrow {
48
49 class ColumnChunkReader;
50 class ColumnReader;
51 struct SchemaManifest;
52 class RowGroupReader;
53
54 /// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
55 ///
56 /// This interfaces caters for different use cases and thus provides different
57 /// interfaces. In its most simplistic form, we cater for a user that wants to
58 /// read the whole Parquet at once with the `FileReader::ReadTable` method.
59 ///
60 /// More advanced users that also want to implement parallelism on top of each
61 /// single Parquet files should do this on the RowGroup level. For this, they can
62 /// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
63 /// RowGroup as a table.
64 ///
65 /// In the most advanced situation, where a consumer wants to independently read
66 /// RowGroups in parallel and consume each column individually, they can call
67 /// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
68 /// instance.
69 ///
70 /// The parquet format supports an optional integer field_id which can be assigned
71 /// to a field. Arrow will convert these field IDs to a metadata key named
72 /// PARQUET:field_id on the appropriate field.
73 // TODO(wesm): nested data does not always make sense with this user
74 // interface unless you are only reading a single leaf node from a branch of
75 // a table. For example:
76 //
77 // repeated group data {
78 // optional group record {
79 // optional int32 val1;
80 // optional byte_array val2;
81 // optional bool val3;
82 // }
83 // optional int32 val4;
84 // }
85 //
// In the Parquet file, there are 4 leaf nodes:
87 //
88 // * data.record.val1
89 // * data.record.val2
90 // * data.record.val3
91 // * data.val4
92 //
93 // When materializing this data in an Arrow array, we would have:
94 //
95 // data: list<struct<
96 // record: struct<
97 // val1: int32,
98 // val2: string (= list<uint8>),
99 // val3: bool,
100 // >,
101 // val4: int32
102 // >>
103 //
104 // However, in the Parquet format, each leaf node has its own repetition and
105 // definition levels describing the structure of the intermediate nodes in
106 // this array structure. Thus, we will need to scan the leaf data for a group
107 // of leaf nodes part of the same type tree to create a single result Arrow
108 // nested array structure.
109 //
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays
class PARQUET_EXPORT FileReader {
 public:
  /// Factory function to create a FileReader from a ParquetFileReader and properties
  ///
  /// \param pool memory pool used for Arrow allocations
  /// \param reader low-level Parquet reader to adopt (ownership is transferred)
  /// \param properties Arrow-level read options (batch size, threading, ...)
  /// \param[out] out the created reader
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              const ArrowReaderProperties& properties,
                              std::unique_ptr<FileReader>* out);

  /// Factory function to create a FileReader from a ParquetFileReader
  /// (uses default ArrowReaderProperties)
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              std::unique_ptr<FileReader>* out);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances
  //
  // Returns error status if the column of interest is not flat.
  virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;

  /// \brief Return arrow schema for all the columns.
  virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

  /// \brief Read column as a whole into a chunked array.
  ///
  /// The indicated column index is relative to the schema
  virtual ::arrow::Status ReadColumn(int i,
                                     std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  // NOTE: Experimental API
  // Reads a specific top level schema field into an Array
  // The index i refers the index of the top level schema field, which may
  // be nested or flat - e.g.
  //
  // 0 foo.bar
  //   foo.bar.baz
  //   foo.qux
  // 1 foo2
  // 2 foo3
  //
  // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
  virtual ::arrow::Status ReadSchemaField(
      int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
  ///
  /// Note that the ordering in row_group_indices matters. FileReaders must outlive
  /// their RecordBatchReaders.
  ///
  /// \returns error Status if row_group_indices contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

  /// \brief Overload of the above returning the reader through a shared_ptr.
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a RecordBatchReader of row groups selected from
  /// row_group_indices, whose columns are selected by column_indices.
  ///
  /// Note that the ordering in row_group_indices and column_indices
  /// matter. FileReaders must outlive their RecordBatchReaders.
  ///
  /// \returns error Status if either row_group_indices or column_indices
  /// contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

  /// \brief Return a generator of record batches.
  ///
  /// The FileReader must outlive the generator, so this requires that you pass in a
  /// shared_ptr.
  ///
  /// \param reader shared_ptr to this reader, kept alive by the generator
  /// \param cpu_executor executor for CPU-bound decode work (default: none)
  /// \param row_group_readahead number of row groups to read ahead (0 = no readahead)
  ///
  /// \returns error Result if either row_group_indices or column_indices contains an
  /// invalid index
  virtual ::arrow::Result<
      std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
  GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
                          const std::vector<int> row_group_indices,
                          const std::vector<int> column_indices,
                          ::arrow::internal::Executor* cpu_executor = NULLPTR,
                          int row_group_readahead = 0) = 0;

  /// \brief Overload of GetRecordBatchReader returning the reader through a shared_ptr.
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// Read all columns into a Table
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns into a Table
  ///
  /// The indicated column indices are relative to the schema
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns of a single row group into a Table.
  virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read all columns of a single row group into a Table.
  virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns of the given row groups into a Table.
  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        const std::vector<int>& column_indices,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read all columns of the given row groups into a Table.
  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Scan file contents with one thread, return number of rows
  virtual ::arrow::Status ScanContents(std::vector<int> columns,
                                       const int32_t column_batch_size,
                                       int64_t* num_rows) = 0;

  /// \brief Return a reader for the RowGroup, this object must not outlive the
  /// FileReader.
  virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

  /// \brief The number of row groups in the file
  virtual int num_row_groups() const = 0;

  /// \brief Return the underlying ParquetFileReader (borrowed pointer; the
  /// FileReader retains ownership).
  virtual ParquetFileReader* parquet_reader() const = 0;

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  virtual void set_use_threads(bool use_threads) = 0;

  /// Set number of records to read per batch for the RecordBatchReader.
  virtual void set_batch_size(int64_t batch_size) = 0;

  /// \brief The ArrowReaderProperties this reader was created with.
  virtual const ArrowReaderProperties& properties() const = 0;

  /// \brief The SchemaManifest for this file.
  /// NOTE(review): presumably describes the Parquet-to-Arrow schema mapping —
  /// confirm against SchemaManifest's declaration.
  virtual const SchemaManifest& manifest() const = 0;

  virtual ~FileReader() = default;
};
250
/// \brief Reads the data of a single row group, either whole-sale into a
/// Table or column-by-column via ColumnChunkReader. Obtained from
/// FileReader::RowGroup() and must not outlive the FileReader.
class RowGroupReader {
 public:
  virtual ~RowGroupReader() = default;
  /// \brief Return a reader for the indicated column chunk of this row group.
  virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
  /// \brief Read the given columns of this row group into a Table.
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;
  /// \brief Read all columns of this row group into a Table.
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

 private:
  struct Iterator;
};
262
/// \brief Reads a single column chunk (one column within one row group) into
/// an arrow::ChunkedArray. Obtained from RowGroupReader::Column().
class ColumnChunkReader {
 public:
  virtual ~ColumnChunkReader() = default;
  /// \brief Read the entire column chunk into *out.
  virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};
268
269 // At this point, the column reader is a stream iterator. It only knows how to
270 // read the next batch of values for a particular column from the file until it
271 // runs out.
272 //
273 // We also do not expose any internal Parquet details, such as row groups. This
274 // might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  virtual ~ColumnReader() = default;

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending how much data is
  // available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including if you have exhausted
  // the data available in the file.
  //
  // batch_size is the maximum number of values to read into *out.
  virtual ::arrow::Status NextBatch(int64_t batch_size,
                                    std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};
291
292 /// \brief Experimental helper class for bindings (like Python) that struggle
293 /// either with std::move or C++ exceptions
class PARQUET_EXPORT FileReaderBuilder {
 public:
  FileReaderBuilder();

  /// Create FileReaderBuilder from Arrow file and optional properties / metadata
  ///
  /// \param file the file to read from
  /// \param properties low-level Parquet reader properties
  /// \param metadata previously parsed file metadata, if available
  ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
                       const ReaderProperties& properties = default_reader_properties(),
                       std::shared_ptr<FileMetaData> metadata = NULLPTR);

  /// Return the low-level reader created by Open() (borrowed pointer; the
  /// builder retains ownership).
  ParquetFileReader* raw_reader() { return raw_reader_.get(); }

  /// Set Arrow MemoryPool for memory allocation
  FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
  /// Set Arrow reader properties
  FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
  /// Build FileReader instance
  ::arrow::Status Build(std::unique_ptr<FileReader>* out);

 private:
  ::arrow::MemoryPool* pool_;
  ArrowReaderProperties properties_;
  // Low-level reader created by Open(); consumed when Build() succeeds.
  std::unique_ptr<ParquetFileReader> raw_reader_;
};
317
318 /// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
319 ///
320 /// @{
321
322 /// \brief Build FileReader from Arrow file and MemoryPool
323 ///
324 /// Advanced settings are supported through the FileReaderBuilder class.
325 PARQUET_EXPORT
326 ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
327 ::arrow::MemoryPool* allocator,
328 std::unique_ptr<FileReader>* reader);
329
330 /// @}
331
332 PARQUET_EXPORT
333 ::arrow::Status StatisticsAsScalars(const Statistics& Statistics,
334 std::shared_ptr<::arrow::Scalar>* min,
335 std::shared_ptr<::arrow::Scalar>* max);
336
337 namespace internal {
338
// Entry point for fuzz testing: attempts to read `size` bytes at `data` as a
// Parquet file, returning the resulting Status instead of crashing.
PARQUET_EXPORT
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
341
342 } // namespace internal
343 } // namespace arrow
344 } // namespace parquet