]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/tools/parquet/parquet_scan.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / tools / parquet / parquet_scan.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
#include <cstdint>
#include <ctime>
#include <exception>
#include <iostream>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include "parquet/api/reader.h"
24
25int main(int argc, char** argv) {
26 if (argc > 4 || argc < 1) {
27 std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
28 << std::endl;
29 return -1;
30 }
31
32 std::string filename;
33
34 // Read command-line options
35 int batch_size = 256;
36 const std::string COLUMNS_PREFIX = "--columns=";
37 const std::string BATCH_SIZE_PREFIX = "--batch-size=";
38 std::vector<int> columns;
39 int num_columns = 0;
40
41 char *param, *value;
42 for (int i = 1; i < argc; i++) {
43 if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
44 value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
45 while (value) {
46 columns.push_back(std::atoi(value));
47 value = std::strtok(nullptr, ",");
48 num_columns++;
49 }
50 } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
51 value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
52 if (value) {
53 batch_size = std::atoi(value);
54 }
55 } else {
56 filename = argv[i];
57 }
58 }
59
60 try {
61 double total_time;
62 std::clock_t start_time = std::clock();
63 std::unique_ptr<parquet::ParquetFileReader> reader =
64 parquet::ParquetFileReader::OpenFile(filename);
65
66 int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
67
68 total_time = static_cast<double>(std::clock() - start_time) /
69 static_cast<double>(CLOCKS_PER_SEC);
70 std::cout << total_rows << " rows scanned in " << total_time << " seconds."
71 << std::endl;
72 } catch (const std::exception& e) {
73 std::cerr << "Parquet error: " << e.what() << std::endl;
74 return -1;
75 }
76
77 return 0;
78}