]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <cstdint> | |
21 | #include <memory> | |
22 | #include <string> | |
23 | #include <utility> | |
24 | #include <vector> | |
25 | ||
26 | #include "arrow/io/interfaces.h" | |
27 | #include "arrow/util/type_fwd.h" | |
28 | #include "arrow/util/visibility.h" | |
29 | ||
30 | namespace arrow { | |
31 | namespace io { | |
32 | ||
33 | struct ARROW_EXPORT CacheOptions { | |
34 | static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9; | |
35 | static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64; | |
36 | ||
37 | /// \brief The maximum distance in bytes between two consecutive | |
38 | /// ranges; beyond this value, ranges are not combined | |
39 | int64_t hole_size_limit; | |
40 | /// \brief The maximum size in bytes of a combined range; if | |
41 | /// combining two consecutive ranges would produce a range of a | |
42 | /// size greater than this, they are not combined | |
43 | int64_t range_size_limit; | |
44 | /// \brief A lazy cache does not perform any I/O until requested. | |
45 | bool lazy; | |
46 | ||
47 | bool operator==(const CacheOptions& other) const { | |
48 | return hole_size_limit == other.hole_size_limit && | |
49 | range_size_limit == other.range_size_limit && lazy == other.lazy; | |
50 | } | |
51 | ||
52 | /// \brief Construct CacheOptions from network storage metrics (e.g. S3). | |
53 | /// | |
54 | /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in | |
55 | /// milliseconds, also known as the call setup latency of a new S3 request. | |
56 | /// The value is a positive integer. | |
57 | /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec. | |
58 | /// The value is a positive integer. | |
59 | /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction | |
60 | /// (per connection) to maximize the net data load. | |
61 | /// The value is a positive double precision number less than 1. | |
62 | /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB) | |
63 | /// to maximize the net data load. | |
64 | /// The value is a positive integer. | |
65 | /// \return A new instance of CacheOptions. | |
66 | static CacheOptions MakeFromNetworkMetrics( | |
67 | int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec, | |
68 | double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac, | |
69 | int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib); | |
70 | ||
71 | static CacheOptions Defaults(); | |
72 | static CacheOptions LazyDefaults(); | |
73 | }; | |
74 | ||
75 | namespace internal { | |
76 | ||
77 | /// \brief A read cache designed to hide IO latencies when reading. | |
78 | /// | |
79 | /// This class takes multiple byte ranges that an application expects to read, and | |
80 | /// coalesces them into fewer, larger read requests, which benefits performance on some | |
81 | /// filesystems, particularly remote ones like Amazon S3. By default, it also issues | |
82 | /// these read requests in parallel up front. | |
83 | /// | |
84 | /// To use: | |
85 | /// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have | |
86 | /// the exact offset and length that will later be read. The cache will combine those | |
87 | /// ranges according to the CacheOptions given to the constructor. | |
88 | /// | |
89 | /// By default, the cache will also start fetching the combined ranges in parallel in | |
90 | /// the background, unless CacheOptions.lazy is set. | |
91 | /// | |
92 | /// 2. Call WaitFor() to be notified when the given ranges have been read. If | |
93 | /// CacheOptions.lazy is set, I/O will be triggered in the background here instead. | |
94 | /// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each | |
95 | /// chunk of the file that can be parsed in parallel). | |
96 | /// | |
97 | /// 3. Call Read() to retrieve the actual data for the given ranges. | |
98 | /// A synchronous application may skip WaitFor() and just call Read() - it will still | |
99 | /// benefit from coalescing and parallel fetching. | |
100 | class ARROW_EXPORT ReadRangeCache { | |
101 | public: | |
102 | static constexpr int64_t kDefaultHoleSizeLimit = 8192; | |
103 | static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024; | |
104 | ||
105 | /// Construct a read cache with default options (CacheOptions::Defaults()) | |
106 | explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx) | |
107 | : ReadRangeCache(file, std::move(ctx), CacheOptions::Defaults()) {} | |
108 | ||
109 | /// Construct a read cache with given options | |
110 | explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx, | |
111 | CacheOptions options); | |
112 | ~ReadRangeCache(); | |
113 | ||
114 | /// \brief Cache the given ranges in the background. | |
115 | /// | |
116 | /// The caller must ensure that the ranges do not overlap with each other, | |
117 | /// nor with previously cached ranges. Otherwise, behaviour will be undefined. | |
118 | Status Cache(std::vector<ReadRange> ranges); | |
119 | ||
120 | /// \brief Read a range previously given to Cache(). | |
121 | Result<std::shared_ptr<Buffer>> Read(ReadRange range); | |
122 | ||
123 | /// \brief Wait until all ranges added so far have been cached. | |
124 | Future<> Wait(); | |
125 | ||
126 | /// \brief Wait until all given ranges have been cached. | |
127 | Future<> WaitFor(std::vector<ReadRange> ranges); | |
128 | ||
129 | protected: | |
130 | struct Impl; | |
131 | struct LazyImpl; | |
132 | ||
133 | std::unique_ptr<Impl> impl_; | |
134 | }; | |
135 | ||
136 | } // namespace internal | |
137 | } // namespace io | |
138 | } // namespace arrow |