// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "parquet/printer.h"

#include <cstdint>
#include <cstdio>
#include <memory>
#include <ostream>
#include <string>
#include <vector>

#include "arrow/util/key_value_metadata.h"
#include "arrow/util/string.h"

#include "parquet/column_scanner.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/schema.h"
#include "parquet/statistics.h"
#include "parquet/types.h"

namespace parquet {

class ColumnReader;

// ----------------------------------------------------------------------
// ParquetFilePrinter::DebugPrint

// Fixed width of one output column in the tabular value dump; the value is
// arbitrary and only serves as an example.
#define COL_WIDTH 30

void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                                    bool print_values, bool format_dump,
                                    bool print_key_value_metadata, const char* filename) {
  const FileMetaData* file_metadata = fileReader->metadata().get();

  stream << "File Name: " << filename << "\n";
  stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
  stream << "Created By: " << file_metadata->created_by() << "\n";
  stream << "Total rows: " << file_metadata->num_rows() << "\n";

  if (print_key_value_metadata && file_metadata->key_value_metadata()) {
    auto key_value_metadata = file_metadata->key_value_metadata();
    int64_t size_of_key_value_metadata = key_value_metadata->size();
    stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
    for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
      stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
             << key_value_metadata->value(i) << "\n";
    }
  }

  stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
  stream << "Number of Real Columns: "
         << file_metadata->schema()->group_node()->field_count() << "\n";

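  // An empty selection selects every column; a non-empty one is validated
  // against the file's column count.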
  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
  stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
           << TypeToString(descr->physical_type());
    const auto& logical_type = descr->logical_type();
    if (!logical_type->is_none()) {
      stream << " / " << logical_type->ToString();
    }
    if (descr->converted_type() != ConvertedType::NONE) {
      stream << " / " << ConvertedTypeToString(descr->converted_type());
      if (descr->converted_type() == ConvertedType::DECIMAL) {
        stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
      }
    }
    stream << ")" << std::endl;
  }

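  // Walk each row group: print its summary sizes, then per-column-chunk
  // metadata, then (optionally) the column values themselves.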
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "--- Row Group: " << r << " ---\n";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
    stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
           << " ---\n";
    stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";

    // Print column metadata
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "Column " << i << std::endl << "  Values: " << column_chunk->num_values();
      if (column_chunk->is_stats_set()) {
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << ", Null Values: " << stats->null_count()
               << ", Distinct Values: " << stats->distinct_count() << std::endl
               << "  Max: " << FormatStatValue(descr->physical_type(), max)
               << ", Min: " << FormatStatValue(descr->physical_type(), min);
      } else {
        stream << "  Statistics Not Set";
      }
      stream << std::endl
             << "  Compression: "
             << ::arrow::internal::AsciiToUpper(
                    Codec::GetCodecAsString(column_chunk->compression()))
             << ", Encodings:";
      for (auto encoding : column_chunk->encodings()) {
        stream << " " << EncodingToString(encoding);
      }
      stream << std::endl
             << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
             << ", Compressed Size: " << column_chunk->total_compressed_size()
             << std::endl;
    }

    if (!print_values) {
      continue;
    }
    stream << "--- Values ---\n";

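    // Room for one fixed-width column cell plus the terminating NUL.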
    static constexpr int bufsize = COL_WIDTH + 1;
    char buffer[bufsize];

    // Create readers for selected columns and print contents
    std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
    int j = 0;
    for (auto i : selected_columns) {
      std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
      // Keeping a scanner over this reader is safe within this method as long
      // as the RowGroupReader that produced it does not get deleted.
      auto& scanner = scanners[j++] = Scanner::Make(col_reader);

      if (format_dump) {
        stream << "Column " << i << std::endl;
        while (scanner->HasNext()) {
          scanner->PrintNext(stream, 0, true);
          stream << "\n";
        }
        continue;
      }

      snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
               file_metadata->schema()->Column(i)->name().c_str());
      stream << buffer << '|';
    }
    if (format_dump) {
      continue;
    }
    stream << "\n";

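    // Emit one table row per pass, advancing every column's scanner in
    // lockstep; stop once no scanner produced a value.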
    bool hasRow;
    do {
      hasRow = false;
      for (auto scanner : scanners) {
        if (scanner->HasNext()) {
          hasRow = true;
          scanner->PrintNext(stream, COL_WIDTH);
          stream << '|';
        }
      }
      stream << "\n";
    } while (hasRow);
  }
}
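
// A minimal usage sketch (the file name and reader setup are illustrative,
// not taken from this file):
//
//   std::unique_ptr<ParquetFileReader> reader =
//       ParquetFileReader::OpenFile("example.parquet");
//   ParquetFilePrinter printer(reader.get());
//   printer.DebugPrint(std::cout, /*selected_columns=*/{},
//                      /*print_values=*/true, /*format_dump=*/false,
//                      /*print_key_value_metadata=*/true, "example.parquet");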

void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
                                   const char* filename) {
  const FileMetaData* file_metadata = fileReader->metadata().get();
  stream << "{\n";
  stream << " \"FileName\": \"" << filename << "\",\n";
  stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
         << "\",\n";
  stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
  stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
  stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
  stream << " \"NumberOfRealColumns\": \""
         << file_metadata->schema()->group_node()->field_count() << "\",\n";
  stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  stream << " \"Columns\": [\n";
  int c = 0;
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << " { \"Id\": \"" << i << "\","
           << " \"Name\": \"" << descr->path()->ToDotString() << "\","
           << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
           << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
           << "\","
           << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
    c++;
    if (c != static_cast<int>(selected_columns.size())) {
      stream << ",\n";
    }
  }

  stream << "\n ],\n \"RowGroups\": [\n";
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << " {\n \"Id\": \"" << r << "\", ";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
    stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
           << "\", ";
    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";

    // Print column metadata
    stream << " \"ColumnChunks\": [\n";
    int c1 = 0;
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << " {\"Id\": \"" << i << "\", \"Values\": \""
             << column_chunk->num_values() << "\", "
             << "\"StatsSet\": ";
      if (column_chunk->is_stats_set()) {
        stream << "\"True\", \"Stats\": {";
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
               << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
               << "\" },";
      } else {
        stream << "\"False\",";
      }
      stream << "\n \"Compression\": \""
             << ::arrow::internal::AsciiToUpper(
                    Codec::GetCodecAsString(column_chunk->compression()))
             << "\", \"Encodings\": \"";
      for (auto encoding : column_chunk->encodings()) {
        stream << EncodingToString(encoding) << " ";
      }
      stream << "\", "
             << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
             << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();

      // end of a ColumnChunk
      stream << "\" }";
      c1++;
      if (c1 != static_cast<int>(selected_columns.size())) {
        stream << ",\n";
      }
    }

    stream << "\n ]\n }";
    if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
      stream << ",\n";
    }
  }
  stream << "\n ]\n}\n";
}
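
// A companion sketch for the JSON dump, reusing the hypothetical printer from
// the example above:
//
//   printer.JSONPrint(std::cout, /*selected_columns=*/{}, "example.parquet");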

}  // namespace parquet