]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/column_page.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / column_page.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18// This module defines an abstract interface for iterating through pages in a
19// Parquet column chunk within a row group. It could be extended in the future
20// to iterate through all data pages in all chunks in a file.
21
22#pragma once
23
24#include <cstdint>
25#include <memory>
26#include <string>
27
28#include "parquet/statistics.h"
29#include "parquet/types.h"
30
31namespace parquet {
32
33// TODO: Parallel processing is not yet safe because of memory-ownership
34// semantics (the PageReader may or may not own the memory referenced by a
35// page)
36//
37// TODO(wesm): In the future Parquet implementations may store the crc code
38// in format::PageHeader. parquet-mr currently does not, so we also skip it
39// here, both on the read and write path
40class Page {
41 public:
42 Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
43 : buffer_(buffer), type_(type) {}
44
45 PageType::type type() const { return type_; }
46
47 std::shared_ptr<Buffer> buffer() const { return buffer_; }
48
49 // @returns: a pointer to the page's data
50 const uint8_t* data() const { return buffer_->data(); }
51
52 // @returns: the total size in bytes of the page's data buffer
53 int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
54
55 private:
56 std::shared_ptr<Buffer> buffer_;
57 PageType::type type_;
58};
59
60/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
61class DataPage : public Page {
62 public:
63 int32_t num_values() const { return num_values_; }
64 Encoding::type encoding() const { return encoding_; }
65 int64_t uncompressed_size() const { return uncompressed_size_; }
66 const EncodedStatistics& statistics() const { return statistics_; }
67
68 virtual ~DataPage() = default;
69
70 protected:
71 DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
72 Encoding::type encoding, int64_t uncompressed_size,
73 const EncodedStatistics& statistics = EncodedStatistics())
74 : Page(buffer, type),
75 num_values_(num_values),
76 encoding_(encoding),
77 uncompressed_size_(uncompressed_size),
78 statistics_(statistics) {}
79
80 int32_t num_values_;
81 Encoding::type encoding_;
82 int64_t uncompressed_size_;
83 EncodedStatistics statistics_;
84};
85
86class DataPageV1 : public DataPage {
87 public:
88 DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
89 Encoding::type encoding, Encoding::type definition_level_encoding,
90 Encoding::type repetition_level_encoding, int64_t uncompressed_size,
91 const EncodedStatistics& statistics = EncodedStatistics())
92 : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
93 statistics),
94 definition_level_encoding_(definition_level_encoding),
95 repetition_level_encoding_(repetition_level_encoding) {}
96
97 Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
98
99 Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
100
101 private:
102 Encoding::type definition_level_encoding_;
103 Encoding::type repetition_level_encoding_;
104};
105
106class DataPageV2 : public DataPage {
107 public:
108 DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
109 int32_t num_rows, Encoding::type encoding,
110 int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
111 int64_t uncompressed_size, bool is_compressed = false,
112 const EncodedStatistics& statistics = EncodedStatistics())
113 : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
114 statistics),
115 num_nulls_(num_nulls),
116 num_rows_(num_rows),
117 definition_levels_byte_length_(definition_levels_byte_length),
118 repetition_levels_byte_length_(repetition_levels_byte_length),
119 is_compressed_(is_compressed) {}
120
121 int32_t num_nulls() const { return num_nulls_; }
122
123 int32_t num_rows() const { return num_rows_; }
124
125 int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
126
127 int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
128
129 bool is_compressed() const { return is_compressed_; }
130
131 private:
132 int32_t num_nulls_;
133 int32_t num_rows_;
134 int32_t definition_levels_byte_length_;
135 int32_t repetition_levels_byte_length_;
136 bool is_compressed_;
137};
138
139class DictionaryPage : public Page {
140 public:
141 DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
142 Encoding::type encoding, bool is_sorted = false)
143 : Page(buffer, PageType::DICTIONARY_PAGE),
144 num_values_(num_values),
145 encoding_(encoding),
146 is_sorted_(is_sorted) {}
147
148 int32_t num_values() const { return num_values_; }
149
150 Encoding::type encoding() const { return encoding_; }
151
152 bool is_sorted() const { return is_sorted_; }
153
154 private:
155 int32_t num_values_;
156 Encoding::type encoding_;
157 bool is_sorted_;
158};
159
160} // namespace parquet