]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | // This module defines an abstract interface for iterating through pages in a | |
19 | // Parquet column chunk within a row group. It could be extended in the future | |
20 | // to iterate through all data pages in all chunks in a file. | |
21 | ||
22 | #pragma once | |
23 | ||
24 | #include <cstdint> | |
25 | #include <memory> | |
26 | #include <string> | |
27 | ||
28 | #include "parquet/statistics.h" | |
29 | #include "parquet/types.h" | |
30 | ||
31 | namespace parquet { | |
32 | ||
33 | // TODO: Parallel processing is not yet safe because of memory-ownership | |
34 | // semantics (the PageReader may or may not own the memory referenced by a | |
35 | // page) | |
36 | // | |
37 | // TODO(wesm): In the future Parquet implementations may store the crc code | |
38 | // in format::PageHeader. parquet-mr currently does not, so we also skip it | |
39 | // here, both on the read and write path | |
40 | class Page { | |
41 | public: | |
42 | Page(const std::shared_ptr<Buffer>& buffer, PageType::type type) | |
43 | : buffer_(buffer), type_(type) {} | |
44 | ||
45 | PageType::type type() const { return type_; } | |
46 | ||
47 | std::shared_ptr<Buffer> buffer() const { return buffer_; } | |
48 | ||
49 | // @returns: a pointer to the page's data | |
50 | const uint8_t* data() const { return buffer_->data(); } | |
51 | ||
52 | // @returns: the total size in bytes of the page's data buffer | |
53 | int32_t size() const { return static_cast<int32_t>(buffer_->size()); } | |
54 | ||
55 | private: | |
56 | std::shared_ptr<Buffer> buffer_; | |
57 | PageType::type type_; | |
58 | }; | |
59 | ||
60 | /// \brief Base type for DataPageV1 and DataPageV2 including common attributes | |
61 | class DataPage : public Page { | |
62 | public: | |
63 | int32_t num_values() const { return num_values_; } | |
64 | Encoding::type encoding() const { return encoding_; } | |
65 | int64_t uncompressed_size() const { return uncompressed_size_; } | |
66 | const EncodedStatistics& statistics() const { return statistics_; } | |
67 | ||
68 | virtual ~DataPage() = default; | |
69 | ||
70 | protected: | |
71 | DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values, | |
72 | Encoding::type encoding, int64_t uncompressed_size, | |
73 | const EncodedStatistics& statistics = EncodedStatistics()) | |
74 | : Page(buffer, type), | |
75 | num_values_(num_values), | |
76 | encoding_(encoding), | |
77 | uncompressed_size_(uncompressed_size), | |
78 | statistics_(statistics) {} | |
79 | ||
80 | int32_t num_values_; | |
81 | Encoding::type encoding_; | |
82 | int64_t uncompressed_size_; | |
83 | EncodedStatistics statistics_; | |
84 | }; | |
85 | ||
86 | class DataPageV1 : public DataPage { | |
87 | public: | |
88 | DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values, | |
89 | Encoding::type encoding, Encoding::type definition_level_encoding, | |
90 | Encoding::type repetition_level_encoding, int64_t uncompressed_size, | |
91 | const EncodedStatistics& statistics = EncodedStatistics()) | |
92 | : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, | |
93 | statistics), | |
94 | definition_level_encoding_(definition_level_encoding), | |
95 | repetition_level_encoding_(repetition_level_encoding) {} | |
96 | ||
97 | Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } | |
98 | ||
99 | Encoding::type definition_level_encoding() const { return definition_level_encoding_; } | |
100 | ||
101 | private: | |
102 | Encoding::type definition_level_encoding_; | |
103 | Encoding::type repetition_level_encoding_; | |
104 | }; | |
105 | ||
106 | class DataPageV2 : public DataPage { | |
107 | public: | |
108 | DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls, | |
109 | int32_t num_rows, Encoding::type encoding, | |
110 | int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, | |
111 | int64_t uncompressed_size, bool is_compressed = false, | |
112 | const EncodedStatistics& statistics = EncodedStatistics()) | |
113 | : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, | |
114 | statistics), | |
115 | num_nulls_(num_nulls), | |
116 | num_rows_(num_rows), | |
117 | definition_levels_byte_length_(definition_levels_byte_length), | |
118 | repetition_levels_byte_length_(repetition_levels_byte_length), | |
119 | is_compressed_(is_compressed) {} | |
120 | ||
121 | int32_t num_nulls() const { return num_nulls_; } | |
122 | ||
123 | int32_t num_rows() const { return num_rows_; } | |
124 | ||
125 | int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } | |
126 | ||
127 | int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } | |
128 | ||
129 | bool is_compressed() const { return is_compressed_; } | |
130 | ||
131 | private: | |
132 | int32_t num_nulls_; | |
133 | int32_t num_rows_; | |
134 | int32_t definition_levels_byte_length_; | |
135 | int32_t repetition_levels_byte_length_; | |
136 | bool is_compressed_; | |
137 | }; | |
138 | ||
139 | class DictionaryPage : public Page { | |
140 | public: | |
141 | DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values, | |
142 | Encoding::type encoding, bool is_sorted = false) | |
143 | : Page(buffer, PageType::DICTIONARY_PAGE), | |
144 | num_values_(num_values), | |
145 | encoding_(encoding), | |
146 | is_sorted_(is_sorted) {} | |
147 | ||
148 | int32_t num_values() const { return num_values_; } | |
149 | ||
150 | Encoding::type encoding() const { return encoding_; } | |
151 | ||
152 | bool is_sorted() const { return is_sorted_; } | |
153 | ||
154 | private: | |
155 | int32_t num_values_; | |
156 | Encoding::type encoding_; | |
157 | bool is_sorted_; | |
158 | }; | |
159 | ||
160 | } // namespace parquet |