]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/dataset/file_benchmark.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / dataset / file_benchmark.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "benchmark/benchmark.h"
19
20 #include "arrow/compute/exec/expression.h"
21 #include "arrow/dataset/discovery.h"
22 #include "arrow/dataset/file_base.h"
23 #include "arrow/dataset/file_ipc.h"
24 #include "arrow/dataset/partition.h"
25 #include "arrow/filesystem/mockfs.h"
26 #include "arrow/filesystem/test_util.h"
27 #include "arrow/testing/gtest_util.h"
28 #include "arrow/util/iterator.h"
29
30 namespace arrow {
31 namespace dataset {
32
33 static std::shared_ptr<Dataset> GetDataset() {
34 std::vector<fs::FileInfo> files;
35 std::vector<std::string> paths;
36 for (int a = 0; a < 100; a++) {
37 for (int b = 0; b < 100; b++) {
38 auto path = "a=" + std::to_string(a) + "/b=" + std::to_string(b) + "/data.feather";
39 files.push_back(fs::File(path));
40 paths.push_back(path);
41 }
42 }
43 EXPECT_OK_AND_ASSIGN(auto fs,
44 arrow::fs::internal::MockFileSystem::Make(fs::kNoTime, files));
45 auto format = std::make_shared<IpcFileFormat>();
46 FileSystemFactoryOptions options;
47 options.partitioning = HivePartitioning::MakeFactory();
48 EXPECT_OK_AND_ASSIGN(auto factory,
49 FileSystemDatasetFactory::Make(fs, paths, format, options));
50 FinishOptions finish_options;
51 finish_options.inspect_options.fragments = 0;
52 EXPECT_OK_AND_ASSIGN(auto dataset, factory->Finish(finish_options));
53 return dataset;
54 }
55
56 // A benchmark of filtering fragments in a dataset.
57 static void GetAllFragments(benchmark::State& state) {
58 auto dataset = GetDataset();
59 for (auto _ : state) {
60 ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments());
61 ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return Status::OK(); }));
62 }
63 }
64
65 static void GetFilteredFragments(benchmark::State& state, compute::Expression filter) {
66 auto dataset = GetDataset();
67 ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset->schema()));
68 for (auto _ : state) {
69 ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments(filter));
70 ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return Status::OK(); }));
71 }
72 }
73
74 using compute::field_ref;
75 using compute::literal;
76
77 BENCHMARK(GetAllFragments);
78 // Drill down to a subtree.
79 BENCHMARK_CAPTURE(GetFilteredFragments, single_dir, equal(field_ref("a"), literal(90)));
80 // Drill down, but not to a subtree.
81 BENCHMARK_CAPTURE(GetFilteredFragments, multi_dir, equal(field_ref("b"), literal(90)));
82 // Drill down to a single file.
83 BENCHMARK_CAPTURE(GetFilteredFragments, single_file,
84 and_(equal(field_ref("a"), literal(90)),
85 equal(field_ref("b"), literal(90))));
86 // Apply a filter, but keep most of the files.
87 BENCHMARK_CAPTURE(GetFilteredFragments, range, greater(field_ref("a"), literal(1)));
88
89 } // namespace dataset
90 } // namespace arrow