]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/arrow/cpp/src/parquet/arrow/reader_internal.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / arrow / reader_internal.h
diff --git a/ceph/src/arrow/cpp/src/parquet/arrow/reader_internal.h b/ceph/src/arrow/cpp/src/parquet/arrow/reader_internal.h
new file mode 100644 (file)
index 0000000..ad0b781
--- /dev/null
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "parquet/arrow/schema.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class Field;
+class KeyValueMetadata;
+class Schema;
+
+}  // namespace arrow
+
+using arrow::Status;
+
+namespace parquet {
+
+class ArrowReaderProperties;
+
+namespace arrow {
+
+class ColumnReaderImpl;
+
+// ----------------------------------------------------------------------
+// Iteration utilities
+
+// Abstraction to decouple row group iteration details from the ColumnReader,
+// so we can read only a single row group if we want
+class FileColumnIterator {
+ public:
+  explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+                              std::vector<int> row_groups)
+      : column_index_(column_index),
+        reader_(reader),
+        schema_(reader->metadata()->schema()),
+        row_groups_(row_groups.begin(), row_groups.end()) {}
+
+  virtual ~FileColumnIterator() {}
+
+  std::unique_ptr<::parquet::PageReader> NextChunk() {
+    if (row_groups_.empty()) {
+      return nullptr;
+    }
+
+    auto row_group_reader = reader_->RowGroup(row_groups_.front());
+    row_groups_.pop_front();
+    return row_group_reader->GetColumnPageReader(column_index_);
+  }
+
+  const SchemaDescriptor* schema() const { return schema_; }
+
+  const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
+
+  std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
+
+  int column_index() const { return column_index_; }
+
+ protected:
+  int column_index_;
+  ParquetFileReader* reader_;
+  const SchemaDescriptor* schema_;
+  std::deque<int> row_groups_;
+};
+
+using FileColumnIteratorFactory =
+    std::function<FileColumnIterator*(int, ParquetFileReader*)>;
+
+Status TransferColumnData(::parquet::internal::RecordReader* reader,
+                          std::shared_ptr<::arrow::DataType> value_type,
+                          const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
+                          std::shared_ptr<::arrow::ChunkedArray>* out);
+
+struct ReaderContext {
+  ParquetFileReader* reader;
+  ::arrow::MemoryPool* pool;
+  FileColumnIteratorFactory iterator_factory;
+  bool filter_leaves;
+  std::shared_ptr<std::unordered_set<int>> included_leaves;
+
+  bool IncludesLeaf(int leaf_index) const {
+    if (this->filter_leaves) {
+      return this->included_leaves->find(leaf_index) != this->included_leaves->end();
+    }
+    return true;
+  }
+};
+
+}  // namespace arrow
+}  // namespace parquet