]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "parquet/printer.h" | |
19 | ||
20 | #include <cstdint> | |
21 | #include <cstdio> | |
22 | #include <memory> | |
23 | #include <ostream> | |
24 | #include <string> | |
25 | #include <vector> | |
26 | ||
27 | #include "arrow/util/key_value_metadata.h" | |
28 | #include "arrow/util/string.h" | |
29 | ||
30 | #include "parquet/column_scanner.h" | |
31 | #include "parquet/exception.h" | |
32 | #include "parquet/file_reader.h" | |
33 | #include "parquet/metadata.h" | |
34 | #include "parquet/schema.h" | |
35 | #include "parquet/statistics.h" | |
36 | #include "parquet/types.h" | |
37 | ||
38 | namespace parquet { | |
39 | ||
40 | class ColumnReader; | |
41 | ||
42 | // ---------------------------------------------------------------------- | |
43 | // ParquetFilePrinter::DebugPrint | |
44 | ||
45 | // Fixed width, in characters, of each column when values are printed as a table | |
46 | #define COL_WIDTH 30 | |
47 | ||
48 | void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, | |
49 | bool print_values, bool format_dump, | |
50 | bool print_key_value_metadata, const char* filename) { | |
51 | const FileMetaData* file_metadata = fileReader->metadata().get(); | |
52 | ||
53 | stream << "File Name: " << filename << "\n"; | |
54 | stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; | |
55 | stream << "Created By: " << file_metadata->created_by() << "\n"; | |
56 | stream << "Total rows: " << file_metadata->num_rows() << "\n"; | |
57 | ||
58 | if (print_key_value_metadata && file_metadata->key_value_metadata()) { | |
59 | auto key_value_metadata = file_metadata->key_value_metadata(); | |
60 | int64_t size_of_key_value_metadata = key_value_metadata->size(); | |
61 | stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n"; | |
62 | for (int64_t i = 0; i < size_of_key_value_metadata; i++) { | |
63 | stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " | |
64 | << key_value_metadata->value(i) << "\n"; | |
65 | } | |
66 | } | |
67 | ||
68 | stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; | |
69 | stream << "Number of Real Columns: " | |
70 | << file_metadata->schema()->group_node()->field_count() << "\n"; | |
71 | ||
72 | if (selected_columns.size() == 0) { | |
73 | for (int i = 0; i < file_metadata->num_columns(); i++) { | |
74 | selected_columns.push_back(i); | |
75 | } | |
76 | } else { | |
77 | for (auto i : selected_columns) { | |
78 | if (i < 0 || i >= file_metadata->num_columns()) { | |
79 | throw ParquetException("Selected column is out of range"); | |
80 | } | |
81 | } | |
82 | } | |
83 | ||
84 | stream << "Number of Columns: " << file_metadata->num_columns() << "\n"; | |
85 | stream << "Number of Selected Columns: " << selected_columns.size() << "\n"; | |
86 | for (auto i : selected_columns) { | |
87 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); | |
88 | stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" | |
89 | << TypeToString(descr->physical_type()); | |
90 | const auto& logical_type = descr->logical_type(); | |
91 | if (!logical_type->is_none()) { | |
92 | stream << " / " << logical_type->ToString(); | |
93 | } | |
94 | if (descr->converted_type() != ConvertedType::NONE) { | |
95 | stream << " / " << ConvertedTypeToString(descr->converted_type()); | |
96 | if (descr->converted_type() == ConvertedType::DECIMAL) { | |
97 | stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")"; | |
98 | } | |
99 | } | |
100 | stream << ")" << std::endl; | |
101 | } | |
102 | ||
103 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { | |
104 | stream << "--- Row Group: " << r << " ---\n"; | |
105 | ||
106 | auto group_reader = fileReader->RowGroup(r); | |
107 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); | |
108 | ||
109 | stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n"; | |
110 | stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size() | |
111 | << " ---\n"; | |
112 | stream << "--- Rows: " << group_metadata->num_rows() << " ---\n"; | |
113 | ||
114 | // Print column metadata | |
115 | for (auto i : selected_columns) { | |
116 | auto column_chunk = group_metadata->ColumnChunk(i); | |
117 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); | |
118 | ||
119 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); | |
120 | stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); | |
121 | if (column_chunk->is_stats_set()) { | |
122 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); | |
123 | stream << ", Null Values: " << stats->null_count() | |
124 | << ", Distinct Values: " << stats->distinct_count() << std::endl | |
125 | << " Max: " << FormatStatValue(descr->physical_type(), max) | |
126 | << ", Min: " << FormatStatValue(descr->physical_type(), min); | |
127 | } else { | |
128 | stream << " Statistics Not Set"; | |
129 | } | |
130 | stream << std::endl | |
131 | << " Compression: " | |
132 | << ::arrow::internal::AsciiToUpper( | |
133 | Codec::GetCodecAsString(column_chunk->compression())) | |
134 | << ", Encodings:"; | |
135 | for (auto encoding : column_chunk->encodings()) { | |
136 | stream << " " << EncodingToString(encoding); | |
137 | } | |
138 | stream << std::endl | |
139 | << " Uncompressed Size: " << column_chunk->total_uncompressed_size() | |
140 | << ", Compressed Size: " << column_chunk->total_compressed_size() | |
141 | << std::endl; | |
142 | } | |
143 | ||
144 | if (!print_values) { | |
145 | continue; | |
146 | } | |
147 | stream << "--- Values ---\n"; | |
148 | ||
149 | static constexpr int bufsize = COL_WIDTH + 1; | |
150 | char buffer[bufsize]; | |
151 | ||
152 | // Create readers for selected columns and print contents | |
153 | std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); | |
154 | int j = 0; | |
155 | for (auto i : selected_columns) { | |
156 | std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); | |
157 | // This is OK in this method as long as the RowGroupReader does not get | |
158 | // deleted | |
159 | auto& scanner = scanners[j++] = Scanner::Make(col_reader); | |
160 | ||
161 | if (format_dump) { | |
162 | stream << "Column " << i << std::endl; | |
163 | while (scanner->HasNext()) { | |
164 | scanner->PrintNext(stream, 0, true); | |
165 | stream << "\n"; | |
166 | } | |
167 | continue; | |
168 | } | |
169 | ||
170 | snprintf(buffer, bufsize, "%-*s", COL_WIDTH, | |
171 | file_metadata->schema()->Column(i)->name().c_str()); | |
172 | stream << buffer << '|'; | |
173 | } | |
174 | if (format_dump) { | |
175 | continue; | |
176 | } | |
177 | stream << "\n"; | |
178 | ||
179 | bool hasRow; | |
180 | do { | |
181 | hasRow = false; | |
182 | for (auto scanner : scanners) { | |
183 | if (scanner->HasNext()) { | |
184 | hasRow = true; | |
185 | scanner->PrintNext(stream, COL_WIDTH); | |
186 | stream << '|'; | |
187 | } | |
188 | } | |
189 | stream << "\n"; | |
190 | } while (hasRow); | |
191 | } | |
192 | } | |
193 | ||
194 | void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, | |
195 | const char* filename) { | |
196 | const FileMetaData* file_metadata = fileReader->metadata().get(); | |
197 | stream << "{\n"; | |
198 | stream << " \"FileName\": \"" << filename << "\",\n"; | |
199 | stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version()) | |
200 | << "\",\n"; | |
201 | stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n"; | |
202 | stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n"; | |
203 | stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n"; | |
204 | stream << " \"NumberOfRealColumns\": \"" | |
205 | << file_metadata->schema()->group_node()->field_count() << "\",\n"; | |
206 | stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n"; | |
207 | ||
208 | if (selected_columns.size() == 0) { | |
209 | for (int i = 0; i < file_metadata->num_columns(); i++) { | |
210 | selected_columns.push_back(i); | |
211 | } | |
212 | } else { | |
213 | for (auto i : selected_columns) { | |
214 | if (i < 0 || i >= file_metadata->num_columns()) { | |
215 | throw ParquetException("Selected column is out of range"); | |
216 | } | |
217 | } | |
218 | } | |
219 | ||
220 | stream << " \"Columns\": [\n"; | |
221 | int c = 0; | |
222 | for (auto i : selected_columns) { | |
223 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); | |
224 | stream << " { \"Id\": \"" << i << "\"," | |
225 | << " \"Name\": \"" << descr->path()->ToDotString() << "\"," | |
226 | << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," | |
227 | << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) | |
228 | << "\"," | |
229 | << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }"; | |
230 | c++; | |
231 | if (c != static_cast<int>(selected_columns.size())) { | |
232 | stream << ",\n"; | |
233 | } | |
234 | } | |
235 | ||
236 | stream << "\n ],\n \"RowGroups\": [\n"; | |
237 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { | |
238 | stream << " {\n \"Id\": \"" << r << "\", "; | |
239 | ||
240 | auto group_reader = fileReader->RowGroup(r); | |
241 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); | |
242 | ||
243 | stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", "; | |
244 | stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size() | |
245 | << "\", "; | |
246 | stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; | |
247 | ||
248 | // Print column metadata | |
249 | stream << " \"ColumnChunks\": [\n"; | |
250 | int c1 = 0; | |
251 | for (auto i : selected_columns) { | |
252 | auto column_chunk = group_metadata->ColumnChunk(i); | |
253 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); | |
254 | ||
255 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); | |
256 | stream << " {\"Id\": \"" << i << "\", \"Values\": \"" | |
257 | << column_chunk->num_values() << "\", " | |
258 | << "\"StatsSet\": "; | |
259 | if (column_chunk->is_stats_set()) { | |
260 | stream << "\"True\", \"Stats\": {"; | |
261 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); | |
262 | stream << "\"NumNulls\": \"" << stats->null_count() << "\", " | |
263 | << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " | |
264 | << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " | |
265 | << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) | |
266 | << "\" },"; | |
267 | } else { | |
268 | stream << "\"False\","; | |
269 | } | |
270 | stream << "\n \"Compression\": \"" | |
271 | << ::arrow::internal::AsciiToUpper( | |
272 | Codec::GetCodecAsString(column_chunk->compression())) | |
273 | << "\", \"Encodings\": \""; | |
274 | for (auto encoding : column_chunk->encodings()) { | |
275 | stream << EncodingToString(encoding) << " "; | |
276 | } | |
277 | stream << "\", " | |
278 | << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() | |
279 | << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); | |
280 | ||
281 | // end of a ColumnChunk | |
282 | stream << "\" }"; | |
283 | c1++; | |
284 | if (c1 != static_cast<int>(selected_columns.size())) { | |
285 | stream << ",\n"; | |
286 | } | |
287 | } | |
288 | ||
289 | stream << "\n ]\n }"; | |
290 | if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { | |
291 | stream << ",\n"; | |
292 | } | |
293 | } | |
294 | stream << "\n ]\n}\n"; | |
295 | } | |
296 | ||
297 | } // namespace parquet |