]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/db/table_cache.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / table_cache.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 //
10 // Thread-safe (provides internal synchronization)
11
12 #pragma once
13 #include <string>
14 #include <vector>
15 #include <stdint.h>
16
17 #include "db/dbformat.h"
18 #include "db/range_del_aggregator.h"
19 #include "options/cf_options.h"
20 #include "port/port.h"
21 #include "rocksdb/cache.h"
22 #include "rocksdb/env.h"
23 #include "rocksdb/options.h"
24 #include "rocksdb/table.h"
25 #include "table/table_reader.h"
26 #include "trace_replay/block_cache_tracer.h"
27
28 namespace ROCKSDB_NAMESPACE {
29
30 class Env;
31 class Arena;
32 struct FileDescriptor;
33 class GetContext;
34 class HistogramImpl;
35
36 // Manages caching for TableReader objects for a column family. The actual
37 // cache is allocated separately and passed to the constructor. TableCache
38 // wraps around the underlying SST file readers by providing Get(),
39 // MultiGet() and NewIterator() methods that hide the instantiation,
40 // caching and access to the TableReader. The main purpose of this is
41 // performance - by caching the TableReader, it avoids unnecessary file opens
42 // and object allocation and instantiation. One exception is compaction, where
43 // a new TableReader may be instantiated - see NewIterator() comments
44 //
45 // Another service provided by TableCache is managing the row cache - if the
46 // DB is configured with a row cache, and the lookup key is present in the row
47 // cache, lookup is very fast. The row cache is obtained from
48 // ioptions.row_cache
49 class TableCache {
50 public:
51 TableCache(const ImmutableCFOptions& ioptions,
52 const FileOptions& storage_options, Cache* cache,
53 BlockCacheTracer* const block_cache_tracer,
54 const std::shared_ptr<IOTracer>& io_tracer);
55 ~TableCache();
56
57 // Return an iterator for the specified file number (the corresponding
58 // file length must be exactly "file_size" bytes). If "table_reader_ptr"
59 // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
60 // underlying the returned iterator, or nullptr if no Table object underlies
61 // the returned iterator. The returned "*table_reader_ptr" object is owned
62 // by the cache and should not be deleted, and is valid for as long as the
63 // returned iterator is live.
64 // @param options Must outlive the returned iterator.
65 // @param range_del_agg If non-nullptr, adds range deletions to the
66 // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
67 // @param for_compaction If true, a new TableReader may be allocated (but
68 // not cached), depending on the CF options
69 // @param skip_filters Disables loading/accessing the filter block
70 // @param level The level this table is at, -1 for "not set / don't know"
71 InternalIterator* NewIterator(
72 const ReadOptions& options, const FileOptions& toptions,
73 const InternalKeyComparator& internal_comparator,
74 const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
75 const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
76 HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
77 bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin,
78 const InternalKey* smallest_compaction_key,
79 const InternalKey* largest_compaction_key, bool allow_unprepared_value);
80
81 // If a seek to internal key "k" in specified file finds an entry,
82 // call get_context->SaveValue() repeatedly until
83 // it returns false. As a side effect, it will insert the TableReader
84 // into the cache and potentially evict another entry
85 // @param get_context Context for get operation. The result of the lookup
86 // can be retrieved by calling get_context->State()
87 // @param file_read_hist If non-nullptr, the file reader statistics are
88 // recorded
89 // @param skip_filters Disables loading/accessing the filter block
90 // @param level The level this table is at, -1 for "not set / don't know"
91 Status Get(const ReadOptions& options,
92 const InternalKeyComparator& internal_comparator,
93 const FileMetaData& file_meta, const Slice& k,
94 GetContext* get_context,
95 const SliceTransform* prefix_extractor = nullptr,
96 HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
97 int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
98
99 // Return the range delete tombstone iterator of the file specified by
100 // `file_meta`.
101 Status GetRangeTombstoneIterator(
102 const ReadOptions& options,
103 const InternalKeyComparator& internal_comparator,
104 const FileMetaData& file_meta,
105 std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
106
107 // If a seek to internal key "k" in specified file finds an entry,
108 // call get_context->SaveValue() repeatedly until
109 // it returns false. As a side effect, it will insert the TableReader
110 // into the cache and potentially evict another entry
111 // @param mget_range Pointer to the structure describing a batch of keys to
112 // be looked up in this table file. The result is stored
113 // in the embedded GetContext
114 // @param skip_filters Disables loading/accessing the filter block
115 // @param level The level this table is at, -1 for "not set / don't know"
116 Status MultiGet(const ReadOptions& options,
117 const InternalKeyComparator& internal_comparator,
118 const FileMetaData& file_meta,
119 const MultiGetContext::Range* mget_range,
120 const SliceTransform* prefix_extractor = nullptr,
121 HistogramImpl* file_read_hist = nullptr,
122 bool skip_filters = false, int level = -1);
123
124 // Evict any entry for the specified file number
125 static void Evict(Cache* cache, uint64_t file_number);
126
127 // Clean table handle and erase it from the table cache
128 // Used in DB close, or the file is not live anymore.
129 void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
130
131 // Find table reader
132 // @param skip_filters Disables loading/accessing the filter block
133 // @param level == -1 means not specified
134 Status FindTable(const ReadOptions& ro, const FileOptions& toptions,
135 const InternalKeyComparator& internal_comparator,
136 const FileDescriptor& file_fd, Cache::Handle**,
137 const SliceTransform* prefix_extractor = nullptr,
138 const bool no_io = false, bool record_read_stats = true,
139 HistogramImpl* file_read_hist = nullptr,
140 bool skip_filters = false, int level = -1,
141 bool prefetch_index_and_filter_in_cache = true,
142 size_t max_file_size_for_l0_meta_pin = 0);
143
144 // Get TableReader from a cache handle.
145 TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
146
147 // Get the table properties of a given table.
148 // @no_io: indicates if we should load table to the cache if it is not present
149 // in table cache yet.
150 // @returns: `properties` will be reset on success. Please note that we will
151 // return Status::Incomplete() if table is not present in cache and
152 // we set `no_io` to be true.
153 Status GetTableProperties(const FileOptions& toptions,
154 const InternalKeyComparator& internal_comparator,
155 const FileDescriptor& file_meta,
156 std::shared_ptr<const TableProperties>* properties,
157 const SliceTransform* prefix_extractor = nullptr,
158 bool no_io = false);
159
160 // Return total memory usage of the table reader of the file.
161 // 0 if table reader of the file is not loaded.
162 size_t GetMemoryUsageByTableReader(
163 const FileOptions& toptions,
164 const InternalKeyComparator& internal_comparator,
165 const FileDescriptor& fd,
166 const SliceTransform* prefix_extractor = nullptr);
167
168 // Returns approximated offset of a key in a file represented by fd.
169 uint64_t ApproximateOffsetOf(
170 const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
171 const InternalKeyComparator& internal_comparator,
172 const SliceTransform* prefix_extractor = nullptr);
173
174 // Returns approximated data size between start and end keys in a file
175 // represented by fd (the start key must not be greater than the end key).
176 uint64_t ApproximateSize(const Slice& start, const Slice& end,
177 const FileDescriptor& fd, TableReaderCaller caller,
178 const InternalKeyComparator& internal_comparator,
179 const SliceTransform* prefix_extractor = nullptr);
180
181 // Release the handle from a cache
182 void ReleaseHandle(Cache::Handle* handle);
183
184 Cache* get_cache() const { return cache_; }
185
186 // Capacity of the backing Cache that indicates inifinite TableCache capacity.
187 // For example when max_open_files is -1 we set the backing Cache to this.
188 static const int kInfiniteCapacity = 0x400000;
189
190 // The tables opened with this TableCache will be immortal, i.e., their
191 // lifetime is as long as that of the DB.
192 void SetTablesAreImmortal() {
193 if (cache_->GetCapacity() >= kInfiniteCapacity) {
194 immortal_tables_ = true;
195 }
196 }
197
198 private:
199 // Build a table reader
200 Status GetTableReader(const ReadOptions& ro, const FileOptions& file_options,
201 const InternalKeyComparator& internal_comparator,
202 const FileDescriptor& fd, bool sequential_mode,
203 bool record_read_stats, HistogramImpl* file_read_hist,
204 std::unique_ptr<TableReader>* table_reader,
205 const SliceTransform* prefix_extractor = nullptr,
206 bool skip_filters = false, int level = -1,
207 bool prefetch_index_and_filter_in_cache = true,
208 size_t max_file_size_for_l0_meta_pin = 0);
209
210 // Create a key prefix for looking up the row cache. The prefix is of the
211 // format row_cache_id + fd_number + seq_no. Later, the user key can be
212 // appended to form the full key
213 void CreateRowCacheKeyPrefix(const ReadOptions& options,
214 const FileDescriptor& fd,
215 const Slice& internal_key,
216 GetContext* get_context, IterKey& row_cache_key);
217
218 // Helper function to lookup the row cache for a key. It appends the
219 // user key to row_cache_key at offset prefix_size
220 bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
221 size_t prefix_size, GetContext* get_context);
222
223 const ImmutableCFOptions& ioptions_;
224 const FileOptions& file_options_;
225 Cache* const cache_;
226 std::string row_cache_id_;
227 bool immortal_tables_;
228 BlockCacheTracer* const block_cache_tracer_;
229 Striped<port::Mutex, Slice> loader_mutex_;
230 std::shared_ptr<IOTracer> io_tracer_;
231 };
232
233 } // namespace ROCKSDB_NAMESPACE