// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
18 #include "benchmark/benchmark.h"
20 #include "arrow/compute/exec/expression.h"
21 #include "arrow/dataset/discovery.h"
22 #include "arrow/dataset/file_base.h"
23 #include "arrow/dataset/file_ipc.h"
24 #include "arrow/dataset/partition.h"
25 #include "arrow/filesystem/mockfs.h"
26 #include "arrow/filesystem/test_util.h"
27 #include "arrow/testing/gtest_util.h"
28 #include "arrow/util/iterator.h"
33 static std::shared_ptr
<Dataset
> GetDataset() {
34 std::vector
<fs::FileInfo
> files
;
35 std::vector
<std::string
> paths
;
36 for (int a
= 0; a
< 100; a
++) {
37 for (int b
= 0; b
< 100; b
++) {
38 auto path
= "a=" + std::to_string(a
) + "/b=" + std::to_string(b
) + "/data.feather";
39 files
.push_back(fs::File(path
));
40 paths
.push_back(path
);
43 EXPECT_OK_AND_ASSIGN(auto fs
,
44 arrow::fs::internal::MockFileSystem::Make(fs::kNoTime
, files
));
45 auto format
= std::make_shared
<IpcFileFormat
>();
46 FileSystemFactoryOptions options
;
47 options
.partitioning
= HivePartitioning::MakeFactory();
48 EXPECT_OK_AND_ASSIGN(auto factory
,
49 FileSystemDatasetFactory::Make(fs
, paths
, format
, options
));
50 FinishOptions finish_options
;
51 finish_options
.inspect_options
.fragments
= 0;
52 EXPECT_OK_AND_ASSIGN(auto dataset
, factory
->Finish(finish_options
));
56 // A benchmark of filtering fragments in a dataset.
57 static void GetAllFragments(benchmark::State
& state
) {
58 auto dataset
= GetDataset();
59 for (auto _
: state
) {
60 ASSERT_OK_AND_ASSIGN(auto fragments
, dataset
->GetFragments());
61 ABORT_NOT_OK(fragments
.Visit([](std::shared_ptr
<Fragment
>) { return Status::OK(); }));
65 static void GetFilteredFragments(benchmark::State
& state
, compute::Expression filter
) {
66 auto dataset
= GetDataset();
67 ASSERT_OK_AND_ASSIGN(filter
, filter
.Bind(*dataset
->schema()));
68 for (auto _
: state
) {
69 ASSERT_OK_AND_ASSIGN(auto fragments
, dataset
->GetFragments(filter
));
70 ABORT_NOT_OK(fragments
.Visit([](std::shared_ptr
<Fragment
>) { return Status::OK(); }));
74 using compute::field_ref
;
75 using compute::literal
;
77 BENCHMARK(GetAllFragments
);
78 // Drill down to a subtree.
79 BENCHMARK_CAPTURE(GetFilteredFragments
, single_dir
, equal(field_ref("a"), literal(90)));
80 // Drill down, but not to a subtree.
81 BENCHMARK_CAPTURE(GetFilteredFragments
, multi_dir
, equal(field_ref("b"), literal(90)));
82 // Drill down to a single file.
83 BENCHMARK_CAPTURE(GetFilteredFragments
, single_file
,
84 and_(equal(field_ref("a"), literal(90)),
85 equal(field_ref("b"), literal(90))));
86 // Apply a filter, but keep most of the files.
87 BENCHMARK_CAPTURE(GetFilteredFragments
, range
, greater(field_ref("a"), literal(1)));
89 } // namespace dataset