# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++
# cython: language_level = 3

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus,
                                        CTable, CMemoryPool, CBuffer,
                                        CKeyValueMetadata,
                                        CRandomAccessFile, COutputStream,
                                        TimeUnit, CRecordBatchReader)
from pyarrow.lib cimport _Weakrefable


cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
    cdef cppclass Node:
        pass

    cdef cppclass GroupNode(Node):
        pass

    cdef cppclass PrimitiveNode(Node):
        pass

    cdef cppclass ColumnPath:
        c_string ToDotString()
        vector[c_string] ToDotVector()


cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
    enum ParquetType" parquet::Type::type":
        ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
        ParquetType_INT32" parquet::Type::INT32"
        ParquetType_INT64" parquet::Type::INT64"
        ParquetType_INT96" parquet::Type::INT96"
        ParquetType_FLOAT" parquet::Type::FLOAT"
        ParquetType_DOUBLE" parquet::Type::DOUBLE"
        ParquetType_BYTE_ARRAY" parquet::Type::BYTE_ARRAY"
        ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"

    enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
        ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
        ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
        ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
        ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
        ParquetLogicalType_ENUM" parquet::LogicalType::Type::ENUM"
        ParquetLogicalType_DECIMAL" parquet::LogicalType::Type::DECIMAL"
        ParquetLogicalType_DATE" parquet::LogicalType::Type::DATE"
        ParquetLogicalType_TIME" parquet::LogicalType::Type::TIME"
        ParquetLogicalType_TIMESTAMP" parquet::LogicalType::Type::TIMESTAMP"
        ParquetLogicalType_INT" parquet::LogicalType::Type::INT"
        ParquetLogicalType_JSON" parquet::LogicalType::Type::JSON"
        ParquetLogicalType_BSON" parquet::LogicalType::Type::BSON"
        ParquetLogicalType_UUID" parquet::LogicalType::Type::UUID"
        ParquetLogicalType_NONE" parquet::LogicalType::Type::NONE"

    enum ParquetTimeUnit" parquet::LogicalType::TimeUnit::unit":
        ParquetTimeUnit_UNKNOWN" parquet::LogicalType::TimeUnit::UNKNOWN"
        ParquetTimeUnit_MILLIS" parquet::LogicalType::TimeUnit::MILLIS"
        ParquetTimeUnit_MICROS" parquet::LogicalType::TimeUnit::MICROS"
        ParquetTimeUnit_NANOS" parquet::LogicalType::TimeUnit::NANOS"

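    # ConvertedType is the legacy annotation scheme that predates
    # LogicalType; both are declared so metadata written by older
    # writers can still be inspected.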
    enum ParquetConvertedType" parquet::ConvertedType::type":
        ParquetConvertedType_NONE" parquet::ConvertedType::NONE"
        ParquetConvertedType_UTF8" parquet::ConvertedType::UTF8"
        ParquetConvertedType_MAP" parquet::ConvertedType::MAP"
        ParquetConvertedType_MAP_KEY_VALUE \
            " parquet::ConvertedType::MAP_KEY_VALUE"
        ParquetConvertedType_LIST" parquet::ConvertedType::LIST"
        ParquetConvertedType_ENUM" parquet::ConvertedType::ENUM"
        ParquetConvertedType_DECIMAL" parquet::ConvertedType::DECIMAL"
        ParquetConvertedType_DATE" parquet::ConvertedType::DATE"
        ParquetConvertedType_TIME_MILLIS" parquet::ConvertedType::TIME_MILLIS"
        ParquetConvertedType_TIME_MICROS" parquet::ConvertedType::TIME_MICROS"
        ParquetConvertedType_TIMESTAMP_MILLIS \
            " parquet::ConvertedType::TIMESTAMP_MILLIS"
        ParquetConvertedType_TIMESTAMP_MICROS \
            " parquet::ConvertedType::TIMESTAMP_MICROS"
        ParquetConvertedType_UINT_8" parquet::ConvertedType::UINT_8"
        ParquetConvertedType_UINT_16" parquet::ConvertedType::UINT_16"
        ParquetConvertedType_UINT_32" parquet::ConvertedType::UINT_32"
        ParquetConvertedType_UINT_64" parquet::ConvertedType::UINT_64"
        ParquetConvertedType_INT_8" parquet::ConvertedType::INT_8"
        ParquetConvertedType_INT_16" parquet::ConvertedType::INT_16"
        ParquetConvertedType_INT_32" parquet::ConvertedType::INT_32"
        ParquetConvertedType_INT_64" parquet::ConvertedType::INT_64"
        ParquetConvertedType_JSON" parquet::ConvertedType::JSON"
        ParquetConvertedType_BSON" parquet::ConvertedType::BSON"
        ParquetConvertedType_INTERVAL" parquet::ConvertedType::INTERVAL"

    enum ParquetRepetition" parquet::Repetition::type":
        ParquetRepetition_REQUIRED" parquet::Repetition::REQUIRED"
        ParquetRepetition_OPTIONAL" parquet::Repetition::OPTIONAL"
        ParquetRepetition_REPEATED" parquet::Repetition::REPEATED"

    enum ParquetEncoding" parquet::Encoding::type":
        ParquetEncoding_PLAIN" parquet::Encoding::PLAIN"
        ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY"
        ParquetEncoding_RLE" parquet::Encoding::RLE"
        ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED"
        ParquetEncoding_DELTA_BINARY_PACKED \
            " parquet::Encoding::DELTA_BINARY_PACKED"
        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \
            " parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY"
        ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY"
        ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY"
        ParquetEncoding_BYTE_STREAM_SPLIT \
            " parquet::Encoding::BYTE_STREAM_SPLIT"

    enum ParquetCompression" parquet::Compression::type":
        ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED"
        ParquetCompression_SNAPPY" parquet::Compression::SNAPPY"
        ParquetCompression_GZIP" parquet::Compression::GZIP"
        ParquetCompression_LZO" parquet::Compression::LZO"
        ParquetCompression_BROTLI" parquet::Compression::BROTLI"
        ParquetCompression_LZ4" parquet::Compression::LZ4"
        ParquetCompression_ZSTD" parquet::Compression::ZSTD"

    enum ParquetVersion" parquet::ParquetVersion::type":
        ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
        ParquetVersion_V2_0" parquet::ParquetVersion::PARQUET_2_0"
        ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4"
        ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6"

    enum ParquetSortOrder" parquet::SortOrder::type":
        ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED"
        ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED"
        ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN"

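    # C++ LogicalType class hierarchy. As elsewhere in this file, the quoted
    # string after a declaration is its Cython "cname": the verbatim C++
    # identifier emitted in the generated code.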
    cdef cppclass CParquetLogicalType" parquet::LogicalType":
        c_string ToString() const
        c_string ToJSON() const
        ParquetLogicalTypeId type() const

    cdef cppclass CParquetDecimalType \
            " parquet::DecimalLogicalType"(CParquetLogicalType):
        int32_t precision() const
        int32_t scale() const

    cdef cppclass CParquetIntType \
            " parquet::IntLogicalType"(CParquetLogicalType):
        int bit_width() const
        c_bool is_signed() const

    cdef cppclass CParquetTimeType \
            " parquet::TimeLogicalType"(CParquetLogicalType):
        c_bool is_adjusted_to_utc() const
        ParquetTimeUnit time_unit() const

    cdef cppclass CParquetTimestampType \
            " parquet::TimestampLogicalType"(CParquetLogicalType):
        c_bool is_adjusted_to_utc() const
        ParquetTimeUnit time_unit() const

    cdef cppclass ColumnDescriptor" parquet::ColumnDescriptor":
        c_bool Equals(const ColumnDescriptor& other)

        shared_ptr[ColumnPath] path()
        int16_t max_definition_level()
        int16_t max_repetition_level()

        ParquetType physical_type()
        const shared_ptr[const CParquetLogicalType]& logical_type()
        ParquetConvertedType converted_type()
        const c_string& name()
        int type_length()
        int type_precision()
        int type_scale()

    cdef cppclass SchemaDescriptor:
        const ColumnDescriptor* Column(int i)
        shared_ptr[Node] schema()
        GroupNode* group()
        c_bool Equals(const SchemaDescriptor& other)
        c_string ToString()
        int num_columns()

    cdef c_string FormatStatValue(ParquetType parquet_type, c_string val)


cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
    cdef cppclass ColumnReader:
        pass

    cdef cppclass BoolReader(ColumnReader):
        pass

    cdef cppclass Int32Reader(ColumnReader):
        pass

    cdef cppclass Int64Reader(ColumnReader):
        pass

    cdef cppclass Int96Reader(ColumnReader):
        pass

    cdef cppclass FloatReader(ColumnReader):
        pass

    cdef cppclass DoubleReader(ColumnReader):
        pass

    cdef cppclass ByteArrayReader(ColumnReader):
        pass

    cdef cppclass RowGroupReader:
        pass

    cdef cppclass CEncodedStatistics" parquet::EncodedStatistics":
        const c_string& max() const
        const c_string& min() const
        int64_t null_count
        int64_t distinct_count
        bint has_min
        bint has_max
        bint has_null_count
        bint has_distinct_count

    cdef cppclass ParquetByteArray" parquet::ByteArray":
        uint32_t len
        const uint8_t* ptr

    cdef cppclass ParquetFLBA" parquet::FLBA":
        const uint8_t* ptr

    cdef cppclass CStatistics" parquet::Statistics":
        int64_t null_count() const
        int64_t distinct_count() const
        int64_t num_values() const
        bint HasMinMax()
        bint HasNullCount()
        bint HasDistinctCount()
        c_bool Equals(const CStatistics&) const
        void Reset()
        c_string EncodeMin()
        c_string EncodeMax()
        CEncodedStatistics Encode()
        void SetComparator()
        ParquetType physical_type() const
        const ColumnDescriptor* descr() const

    cdef cppclass CBoolStatistics" parquet::BoolStatistics"(CStatistics):
        c_bool min()
        c_bool max()

    cdef cppclass CInt32Statistics" parquet::Int32Statistics"(CStatistics):
        int32_t min()
        int32_t max()

    cdef cppclass CInt64Statistics" parquet::Int64Statistics"(CStatistics):
        int64_t min()
        int64_t max()

    cdef cppclass CFloatStatistics" parquet::FloatStatistics"(CStatistics):
        float min()
        float max()

    cdef cppclass CDoubleStatistics" parquet::DoubleStatistics"(CStatistics):
        double min()
        double max()

    cdef cppclass CByteArrayStatistics \
            " parquet::ByteArrayStatistics"(CStatistics):
        ParquetByteArray min()
        ParquetByteArray max()

    cdef cppclass CFLBAStatistics" parquet::FLBAStatistics"(CStatistics):
        ParquetFLBA min()
        ParquetFLBA max()

    cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData":
        int64_t file_offset() const
        const c_string& file_path() const

        ParquetType type() const
        int64_t num_values() const
        shared_ptr[ColumnPath] path_in_schema() const
        bint is_stats_set() const
        shared_ptr[CStatistics] statistics() const
        ParquetCompression compression() const
        const vector[ParquetEncoding]& encodings() const
        c_bool Equals(const CColumnChunkMetaData&) const

        int64_t has_dictionary_page() const
        int64_t dictionary_page_offset() const
        int64_t data_page_offset() const
        int64_t index_page_offset() const
        int64_t total_compressed_size() const
        int64_t total_uncompressed_size() const

    cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
        c_bool Equals(const CRowGroupMetaData&) const
        int num_columns()
        int64_t num_rows()
        int64_t total_byte_size()
        unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const

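    # File-level metadata. CFileMetaData_Make wraps the static
    # parquet::FileMetaData::Make factory, which deserializes a metadata
    # blob; WriteTo serializes it back to an output stream.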
    cdef cppclass CFileMetaData" parquet::FileMetaData":
        c_bool Equals(const CFileMetaData&) const
        uint32_t size()
        int num_columns()
        int64_t num_rows()
        int num_row_groups()
        ParquetVersion version()
        const c_string created_by()
        int num_schema_elements()

        void set_file_path(const c_string& path)
        void AppendRowGroups(const CFileMetaData& other) except +

        unique_ptr[CRowGroupMetaData] RowGroup(int i)
        const SchemaDescriptor* schema()
        shared_ptr[const CKeyValueMetadata] key_value_metadata() const
        void WriteTo(COutputStream* dst) const

    cdef shared_ptr[CFileMetaData] CFileMetaData_Make \
        " parquet::FileMetaData::Make"(const void* serialized_metadata,
                                       uint32_t* metadata_len)

    cdef cppclass CReaderProperties" parquet::ReaderProperties":
        c_bool is_buffered_stream_enabled() const
        void enable_buffered_stream()
        void disable_buffered_stream()
        void set_buffer_size(int64_t buf_size)
        int64_t buffer_size() const

    CReaderProperties default_reader_properties()

    cdef cppclass ArrowReaderProperties:
        ArrowReaderProperties()
        void set_read_dictionary(int column_index, c_bool read_dict)
        c_bool read_dictionary()
        void set_batch_size(int64_t batch_size)
        int64_t batch_size()
        void set_pre_buffer(c_bool pre_buffer)
        c_bool pre_buffer() const
        void set_coerce_int96_timestamp_unit(TimeUnit unit)
        TimeUnit coerce_int96_timestamp_unit() const

    ArrowReaderProperties default_arrow_reader_properties()

    cdef cppclass ParquetFileReader:
        shared_ptr[CFileMetaData] metadata()


cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
    cdef cppclass WriterProperties:
        cppclass Builder:
            Builder* data_page_version(ParquetDataPageVersion version)
            Builder* version(ParquetVersion version)
            Builder* compression(ParquetCompression codec)
            Builder* compression(const c_string& path,
                                 ParquetCompression codec)
            Builder* compression_level(int compression_level)
            Builder* compression_level(const c_string& path,
                                       int compression_level)
            Builder* disable_dictionary()
            Builder* enable_dictionary()
            Builder* enable_dictionary(const c_string& path)
            Builder* disable_statistics()
            Builder* enable_statistics()
            Builder* enable_statistics(const c_string& path)
            Builder* data_pagesize(int64_t size)
            Builder* encoding(ParquetEncoding encoding)
            Builder* encoding(const c_string& path,
                              ParquetEncoding encoding)
            Builder* write_batch_size(int64_t batch_size)
            shared_ptr[WriterProperties] build()

    cdef cppclass ArrowWriterProperties:
        cppclass Builder:
            Builder()
            Builder* disable_deprecated_int96_timestamps()
            Builder* enable_deprecated_int96_timestamps()
            Builder* coerce_timestamps(TimeUnit unit)
            Builder* allow_truncated_timestamps()
            Builder* disallow_truncated_timestamps()
            Builder* store_schema()
            Builder* enable_compliant_nested_types()
            Builder* disable_compliant_nested_types()
            Builder* set_engine_version(ArrowWriterEngineVersion version)
            shared_ptr[ArrowWriterProperties] build()
        c_bool support_deprecated_int96_timestamps()


cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
    cdef cppclass FileReader:
        FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader)

        CStatus GetSchema(shared_ptr[CSchema]* out)

        CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out)
        CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out)

        int num_row_groups()
        CStatus ReadRowGroup(int i, shared_ptr[CTable]* out)
        CStatus ReadRowGroup(int i, const vector[int]& column_indices,
                             shared_ptr[CTable]* out)

        CStatus ReadRowGroups(const vector[int]& row_groups,
                              shared_ptr[CTable]* out)
        CStatus ReadRowGroups(const vector[int]& row_groups,
                              const vector[int]& column_indices,
                              shared_ptr[CTable]* out)

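        # Streaming access: returns a record batch reader over the selected
        # row groups instead of materializing them as a single table.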
        CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
                                     const vector[int]& column_indices,
                                     unique_ptr[CRecordBatchReader]* out)
        CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
                                     unique_ptr[CRecordBatchReader]* out)

        CStatus ReadTable(shared_ptr[CTable]* out)
        CStatus ReadTable(const vector[int]& column_indices,
                          shared_ptr[CTable]* out)

        CStatus ScanContents(vector[int] columns, int32_t column_batch_size,
                             int64_t* num_rows)

        const ParquetFileReader* parquet_reader()

        void set_use_threads(c_bool use_threads)
        void set_batch_size(int64_t batch_size)

    cdef cppclass FileReaderBuilder:
        FileReaderBuilder()
        CStatus Open(const shared_ptr[CRandomAccessFile]& file,
                     const CReaderProperties& properties,
                     const shared_ptr[CFileMetaData]& metadata)

        ParquetFileReader* raw_reader()
        FileReaderBuilder* memory_pool(CMemoryPool*)
        FileReaderBuilder* properties(const ArrowReaderProperties&)
        CStatus Build(unique_ptr[FileReader]* out)

    CStatus FromParquetSchema(
        const SchemaDescriptor* parquet_schema,
        const ArrowReaderProperties& properties,
        const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
        shared_ptr[CSchema]* out)


cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
    CStatus ToParquetSchema(
        const CSchema* arrow_schema,
        const ArrowReaderProperties& properties,
        const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
        shared_ptr[SchemaDescriptor]* out)


cdef extern from "parquet/properties.h" namespace "parquet" nogil:
    cdef enum ArrowWriterEngineVersion:
        V1 "parquet::ArrowWriterProperties::V1",
        V2 "parquet::ArrowWriterProperties::V2"

    cdef cppclass ParquetDataPageVersion:
        pass

    cdef ParquetDataPageVersion ParquetDataPageVersion_V1 \
        " parquet::ParquetDataPageVersion::V1"
    cdef ParquetDataPageVersion ParquetDataPageVersion_V2 \
        " parquet::ParquetDataPageVersion::V2"


cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
    cdef cppclass FileWriter:

        @staticmethod
        CStatus Open(const CSchema& schema, CMemoryPool* pool,
                     const shared_ptr[COutputStream]& sink,
                     const shared_ptr[WriterProperties]& properties,
                     const shared_ptr[ArrowWriterProperties]&
                     arrow_properties,
                     unique_ptr[FileWriter]* writer)

        CStatus WriteTable(const CTable& table, int64_t chunk_size)
        CStatus NewRowGroup(int64_t chunk_size)
        CStatus Close()

        const shared_ptr[CFileMetaData] metadata() const

    CStatus WriteMetaDataFile(
        const CFileMetaData& file_metadata,
        const COutputStream* sink)


cdef shared_ptr[WriterProperties] _create_writer_properties(
    use_dictionary=*,
    compression=*,
    version=*,
    write_statistics=*,
    data_page_size=*,
    compression_level=*,
    use_byte_stream_split=*,
    data_page_version=*) except *


cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
    use_deprecated_int96_timestamps=*,
    coerce_timestamps=*,
    allow_truncated_timestamps=*,
    writer_engine_version=*,
    use_compliant_nested_type=*) except *


cdef class ParquetSchema(_Weakrefable):
    cdef:
        FileMetaData parent  # the FileMetaData owning the SchemaDescriptor
        const SchemaDescriptor* schema


cdef class FileMetaData(_Weakrefable):
    cdef:
        shared_ptr[CFileMetaData] sp_metadata
        CFileMetaData* _metadata
        ParquetSchema _schema

    cdef inline init(self, const shared_ptr[CFileMetaData]& metadata):
        self.sp_metadata = metadata
        self._metadata = metadata.get()


cdef class RowGroupMetaData(_Weakrefable):
    cdef:
        int index  # for pickling support
        unique_ptr[CRowGroupMetaData] up_metadata
        CRowGroupMetaData* metadata
        FileMetaData parent

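# ColumnChunkMetaData and Statistics, like the wrappers above, hold a
# ``parent`` reference so the C++ object that owns the wrapped pointer
# stays alive for as long as the Python-level view does.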
cdef class ColumnChunkMetaData(_Weakrefable):
    cdef:
        unique_ptr[CColumnChunkMetaData] up_metadata
        CColumnChunkMetaData* metadata
        RowGroupMetaData parent

    cdef inline init(self, RowGroupMetaData parent, int i):
        self.up_metadata = parent.metadata.ColumnChunk(i)
        self.metadata = self.up_metadata.get()
        self.parent = parent


cdef class Statistics(_Weakrefable):
    cdef:
        shared_ptr[CStatistics] statistics
        ColumnChunkMetaData parent

    cdef inline init(self, const shared_ptr[CStatistics]& statistics,
                     ColumnChunkMetaData parent):
        self.statistics = statistics
        self.parent = parent
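
# Illustrative sketch (not part of the original declarations): a .pyx module
# consuming this file would typically open a reader through FileReaderBuilder
# along these lines, where ``input_file``, ``c_metadata`` and ``check_status``
# are assumed to be provided by the surrounding module:
#
#     cdef:
#         FileReaderBuilder builder
#         unique_ptr[FileReader] reader
#         CReaderProperties props = default_reader_properties()
#     check_status(builder.Open(input_file, props, c_metadata))
#     check_status(builder.Build(&reader))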