// ceph/src/arrow/cpp/src/arrow/io/caching.h (Apache Arrow, imported with Ceph Quincy 17.2.0)
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/io/interfaces.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace io {

struct ARROW_EXPORT CacheOptions {
  static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
  static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;

  /// \brief The maximum distance in bytes between two consecutive
  /// ranges; beyond this value, ranges are not combined
  int64_t hole_size_limit;
  /// \brief The maximum size in bytes of a combined range; if
  /// combining two consecutive ranges would produce a range of a
  /// size greater than this, they are not combined
  int64_t range_size_limit;
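  // Illustrative example of how the two limits interact (the requested ranges
  // are hypothetical; the limits used are the ReadRangeCache defaults declared
  // below): with hole_size_limit = 8192 and range_size_limit = 32 MiB, ranges
  // [0, 4096) and [10000, 14096) are separated by a 5904-byte hole, so they
  // are coalesced into one read covering [0, 14096); a further range starting
  // at offset 1000000 lies more than 8192 bytes past the previous one and is
  // fetched as a separate request.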
  /// \brief A lazy cache does not perform any I/O until requested.
  bool lazy;

  bool operator==(const CacheOptions& other) const {
    return hole_size_limit == other.hole_size_limit &&
           range_size_limit == other.range_size_limit && lazy == other.lazy;
  }

  /// \brief Construct CacheOptions from network storage metrics (e.g. S3).
  ///
  /// \param[in] time_to_first_byte_millis Seek time, or Time-To-First-Byte (TTFB), in
  /// milliseconds, i.e. the call setup latency of a new S3 request.
  /// The value is a positive integer.
  /// \param[in] transfer_bandwidth_mib_per_sec Data transfer bandwidth (BW) in MiB/sec.
  /// The value is a positive integer.
  /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
  /// (per connection) to maximize the net data load.
  /// The value is a positive double precision number less than 1.
  /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB)
  /// to maximize the net data load.
  /// The value is a positive integer.
  /// \return A new instance of CacheOptions.
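  ///
  /// A minimal usage sketch (the metric values below are illustrative
  /// assumptions, not recommendations; the resulting hole_size_limit and
  /// range_size_limit are computed internally):
  /// \code
  /// // e.g. an S3-like store with ~100 ms TTFB and ~100 MiB/s per connection
  /// auto options = CacheOptions::MakeFromNetworkMetrics(
  ///     /*time_to_first_byte_millis=*/100,
  ///     /*transfer_bandwidth_mib_per_sec=*/100);
  /// \endcode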
  static CacheOptions MakeFromNetworkMetrics(
      int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec,
      double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac,
      int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);

  static CacheOptions Defaults();
  static CacheOptions LazyDefaults();
};

namespace internal {

/// \brief A read cache designed to hide IO latencies when reading.
///
/// This class takes multiple byte ranges that an application expects to read, and
/// coalesces them into fewer, larger read requests, which benefits performance on some
/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
/// these read requests in parallel up front.
///
/// To use:
/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
///    the exact offset and length that will later be read. The cache will combine those
///    ranges according to parameters (see constructor).
///
///    By default, the cache will also start fetching the combined ranges in parallel in
///    the background, unless CacheOptions.lazy is set.
///
/// 2. Call WaitFor() to be notified when the given ranges have been read. If
///    CacheOptions.lazy is set, I/O will be triggered in the background here instead.
///    This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
///    chunk of the file that can be parsed in parallel).
///
/// 3. Call Read() to retrieve the actual data for the given ranges.
///    A synchronous application may skip WaitFor() and just call Read() - it will still
///    benefit from coalescing and parallel fetching (see the usage sketch below).
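///
/// A minimal usage sketch of the three steps above (illustrative only: the
/// enclosing helper is hypothetical, and `file`/`ctx` are assumed to be an
/// existing RandomAccessFile and IOContext):
/// \code
/// Status CacheAndReadChunks(std::shared_ptr<RandomAccessFile> file, IOContext ctx) {
///   internal::ReadRangeCache cache(file, ctx, CacheOptions::Defaults());
///   // 1. Declare the byte ranges (offset, length) that will be needed.
///   ARROW_RETURN_NOT_OK(cache.Cache({{0, 16384}, {1 << 20, 16384}}));
///   // 2. Optionally wait until the first range has been fetched.
///   ARROW_RETURN_NOT_OK(cache.WaitFor({{0, 16384}}).status());
///   // 3. Retrieve the cached data.
///   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf, cache.Read({0, 16384}));
///   return Status::OK();
/// }
/// \endcode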
class ARROW_EXPORT ReadRangeCache {
 public:
  static constexpr int64_t kDefaultHoleSizeLimit = 8192;
  static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;

  /// Construct a read cache with default options
  explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
      : ReadRangeCache(file, std::move(ctx), CacheOptions::Defaults()) {}

  /// Construct a read cache with given options
  explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
                          CacheOptions options);
  ~ReadRangeCache();

  /// \brief Cache the given ranges in the background.
  ///
  /// The caller must ensure that the ranges do not overlap with each other,
  /// nor with previously cached ranges. Otherwise, behaviour will be undefined.
  Status Cache(std::vector<ReadRange> ranges);

  /// \brief Read a range previously given to Cache().
  Result<std::shared_ptr<Buffer>> Read(ReadRange range);

  /// \brief Wait until all ranges added so far have been cached.
  Future<> Wait();

  /// \brief Wait until all given ranges have been cached.
  Future<> WaitFor(std::vector<ReadRange> ranges);

 protected:
  struct Impl;
  struct LazyImpl;

  std::unique_ptr<Impl> impl_;
};

}  // namespace internal
}  // namespace io
}  // namespace arrow