1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 // Thread-safe (provides internal synchronization)
17 #include "db/dbformat.h"
18 #include "db/range_del_aggregator.h"
19 #include "options/cf_options.h"
20 #include "port/port.h"
21 #include "rocksdb/cache.h"
22 #include "rocksdb/env.h"
23 #include "rocksdb/options.h"
24 #include "rocksdb/table.h"
25 #include "table/table_reader.h"
26 #include "trace_replay/block_cache_tracer.h"
28 namespace ROCKSDB_NAMESPACE
{
// Forward declaration only: FileDescriptor is not defined in this part of the
// file. It is only used by reference (const FileDescriptor&) in the
// declarations below.
32 struct FileDescriptor
;
36 // Manages caching for TableReader objects for a column family. The actual
37 // cache is allocated separately and passed to the constructor. TableCache
38 // wraps around the underlying SST file readers by providing Get(),
39 // MultiGet() and NewIterator() methods that hide the instantiation,
40 // caching and access to the TableReader. The main purpose of this is
41 // performance - by caching the TableReader, it avoids unnecessary file opens
42 // and object allocation and instantiation. One exception is compaction, where
43 // a new TableReader may be instantiated - see NewIterator() comments
45 // Another service provided by TableCache is managing the row cache - if the
46 // DB is configured with a row cache, and the lookup key is present in the row
47 // cache, lookup is very fast. The row cache is obtained from
// ioptions.row_cache.
// Constructs a TableCache on top of an externally allocated cache.
// @param ioptions Immutable column family options (presumably retained as
//    ioptions_ -- confirm against the definition).
// @param storage_options File options (presumably retained as file_options_
//    -- confirm against the definition).
// @param cache The cache used for TableReader objects; it is allocated
//    separately and passed in rather than created here.
// @param block_cache_tracer Block cache tracer (presumably retained as
//    block_cache_tracer_ -- confirm against the definition).
51 TableCache(const ImmutableCFOptions
& ioptions
,
52 const FileOptions
& storage_options
, Cache
* cache
,
53 BlockCacheTracer
* const block_cache_tracer
);
56 // Return an iterator for the specified file number (the corresponding
57 // file length must be exactly "file_size" bytes). If "table_reader_ptr"
58 // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
59 // underlying the returned iterator, or nullptr if no Table object underlies
60 // the returned iterator. The returned "*table_reader_ptr" object is owned
61 // by the cache and should not be deleted, and is valid for as long as the
62 // returned iterator is live.
// If an error occurs, it is returned in a NewErrorInternalIterator.
// NOTE(review): "file number" and "file_size" are not parameters of this
// declaration; the file is described by `file_meta`.
63 // @param range_del_agg If non-nullptr, adds range deletions to the
64 // aggregator.
65 // @param for_compaction If true, a new TableReader may be allocated (but
66 // not cached), depending on the CF options
// NOTE(review): `for_compaction` is not a parameter of this declaration;
// possibly superseded by `caller` -- confirm against the definition.
67 // @param skip_filters Disables loading/accessing the filter block
68 // @param level The level this table is at, -1 for "not set / don't know"
// NOTE(review): options, toptions, internal_comparator, prefix_extractor,
// file_read_hist, caller, arena, smallest_compaction_key and
// largest_compaction_key are not described here; see the definition.
69 InternalIterator
* NewIterator(
70 const ReadOptions
& options
, const FileOptions
& toptions
,
71 const InternalKeyComparator
& internal_comparator
,
72 const FileMetaData
& file_meta
, RangeDelAggregator
* range_del_agg
,
73 const SliceTransform
* prefix_extractor
, TableReader
** table_reader_ptr
,
74 HistogramImpl
* file_read_hist
, TableReaderCaller caller
, Arena
* arena
,
75 bool skip_filters
, int level
, const InternalKey
* smallest_compaction_key
,
76 const InternalKey
* largest_compaction_key
);
78 // If a seek to internal key "k" in specified file finds an entry,
79 // call get_context->SaveValue() repeatedly until
80 // it returns false. As a side effect, it will insert the TableReader
81 // into the cache and potentially evict another entry
82 // @param get_context Context for get operation. The result of the lookup
83 // can be retrieved by calling get_context->State()
84 // @param file_read_hist If non-nullptr, the file reader statistics are
//                       recorded
86 // @param skip_filters Disables loading/accessing the filter block
87 // @param level The level this table is at, -1 for "not set / don't know"
88 Status
Get(const ReadOptions
& options
,
89 const InternalKeyComparator
& internal_comparator
,
90 const FileMetaData
& file_meta
, const Slice
& k
,
91 GetContext
* get_context
,
92 const SliceTransform
* prefix_extractor
= nullptr,
93 HistogramImpl
* file_read_hist
= nullptr, bool skip_filters
= false,
96 // Return the range delete tombstone iterator of the file specified by
// `file_meta`.
// @param options Read options for the lookup.
// @param internal_comparator Comparator for internal keys.
// @param file_meta Metadata of the file whose tombstones are requested.
// @param out_iter Output parameter through which the iterator is handed back
//    (presumably only set when the returned Status is OK -- confirm against
//    the definition).
// @return Status of the operation.
98 Status
GetRangeTombstoneIterator(
99 const ReadOptions
& options
,
100 const InternalKeyComparator
& internal_comparator
,
101 const FileMetaData
& file_meta
,
102 std::unique_ptr
<FragmentedRangeTombstoneIterator
>* out_iter
);
104 // If a seek to internal key "k" in specified file finds an entry,
105 // call get_context->SaveValue() repeatedly until
106 // it returns false. As a side effect, it will insert the TableReader
107 // into the cache and potentially evict another entry
// NOTE(review): this declaration has no "k" or get_context parameter; the
// keys and their GetContext come from `mget_range` (see below).
108 // @param mget_range Pointer to the structure describing a batch of keys to
109 // be looked up in this table file. The result is stored
110 // in the embedded GetContext
111 // @param skip_filters Disables loading/accessing the filter block
112 // @param level The level this table is at, -1 for "not set / don't know"
// prefix_extractor and file_read_hist default to nullptr, skip_filters to
// false and level to -1.
113 Status
MultiGet(const ReadOptions
& options
,
114 const InternalKeyComparator
& internal_comparator
,
115 const FileMetaData
& file_meta
,
116 const MultiGetContext::Range
* mget_range
,
117 const SliceTransform
* prefix_extractor
= nullptr,
118 HistogramImpl
* file_read_hist
= nullptr,
119 bool skip_filters
= false, int level
= -1);
121 // Evict any entry for the specified file number
// Static: works on the cache passed in, not on a TableCache instance.
// @param cache The cache to evict from.
// @param file_number Number of the file whose entry is evicted.
122 static void Evict(Cache
* cache
, uint64_t file_number
);
124 // Clean table handle and erase it from the table cache
125 // Used in DB close, or the file is not live anymore.
// @param fd Descriptor of the file whose table entry is erased.
// @param handle Cache handle of that entry.
126 void EraseHandle(const FileDescriptor
& fd
, Cache::Handle
* handle
);
// Looks up the table for `file_fd`. The result is presumably handed back as a
// cache handle through the unnamed Cache::Handle** parameter -- confirm
// against the definition.
129 // @param skip_filters Disables loading/accessing the filter block
130 // @param level == -1 means not specified
// Defaults: prefix_extractor = nullptr, no_io = false,
// record_read_stats = true, file_read_hist = nullptr, skip_filters = false,
// level = -1, prefetch_index_and_filter_in_cache = true.
// NOTE(review): no_io presumably prevents opening the file when the table is
// not already cached -- confirm against the definition.
// @return Status of the lookup.
131 Status
FindTable(const FileOptions
& toptions
,
132 const InternalKeyComparator
& internal_comparator
,
133 const FileDescriptor
& file_fd
, Cache::Handle
**,
134 const SliceTransform
* prefix_extractor
= nullptr,
135 const bool no_io
= false, bool record_read_stats
= true,
136 HistogramImpl
* file_read_hist
= nullptr,
137 bool skip_filters
= false, int level
= -1,
138 bool prefetch_index_and_filter_in_cache
= true);
140 // Get TableReader from a cache handle.
// @param handle Cache handle to read the TableReader from.
// @return Pointer to the TableReader associated with `handle`.
141 TableReader
* GetTableReaderFromHandle(Cache::Handle
* handle
);
143 // Get the table properties of a given table.
144 // @no_io: indicates if we should load table to the cache if it is not present
145 // in table cache yet.
146 // @returns: `properties` will be reset on success. Please note that we will
147 // return Status::Incomplete() if table is not present in cache and
148 // we set `no_io` to be true.
149 Status
GetTableProperties(const FileOptions
& toptions
,
150 const InternalKeyComparator
& internal_comparator
,
151 const FileDescriptor
& file_meta
,
152 std::shared_ptr
<const TableProperties
>* properties
,
153 const SliceTransform
* prefix_extractor
= nullptr,
156 // Return total memory usage of the table reader of the file.
157 // 0 if table reader of the file is not loaded.
// @param toptions File options.
// @param internal_comparator Comparator for internal keys.
// @param fd Descriptor of the file whose table reader is measured.
// @param prefix_extractor Optional prefix extractor; defaults to nullptr.
// @return Memory usage as a size_t (unit not stated here; presumably bytes).
158 size_t GetMemoryUsageByTableReader(
159 const FileOptions
& toptions
,
160 const InternalKeyComparator
& internal_comparator
,
161 const FileDescriptor
& fd
,
162 const SliceTransform
* prefix_extractor
= nullptr);
164 // Returns approximated offset of a key in a file represented by fd.
// @param key The key whose offset is estimated.
// @param fd Descriptor of the file to look in.
// @param caller Identifies the caller of the table reader.
// @param internal_comparator Comparator for internal keys.
// @param prefix_extractor Optional prefix extractor; defaults to nullptr.
// @return The approximate offset as a uint64_t.
165 uint64_t ApproximateOffsetOf(
166 const Slice
& key
, const FileDescriptor
& fd
, TableReaderCaller caller
,
167 const InternalKeyComparator
& internal_comparator
,
168 const SliceTransform
* prefix_extractor
= nullptr);
170 // Returns approximated data size between start and end keys in a file
171 // represented by fd (the start key must not be greater than the end key).
// @param start Lower key of the range.
// @param end Upper key of the range.
// @param fd Descriptor of the file to look in.
// @param caller Identifies the caller of the table reader.
// @param internal_comparator Comparator for internal keys.
// @param prefix_extractor Optional prefix extractor; defaults to nullptr.
// @return The approximate size as a uint64_t.
172 uint64_t ApproximateSize(const Slice
& start
, const Slice
& end
,
173 const FileDescriptor
& fd
, TableReaderCaller caller
,
174 const InternalKeyComparator
& internal_comparator
,
175 const SliceTransform
* prefix_extractor
= nullptr);
177 // Release the handle from a cache
// @param handle The cache handle to release.
178 void ReleaseHandle(Cache::Handle
* handle
);
// Accessor for the backing cache: returns the cache_ pointer unchanged.
180 Cache
* get_cache() const { return cache_
; }
182 // Capacity of the backing Cache that indicates infinite TableCache capacity.
183 // For example when max_open_files is -1 we set the backing Cache to this.
// 0x400000 == 4194304. SetTablesAreImmortal() compares the cache's capacity
// against this value.
184 static const int kInfiniteCapacity
= 0x400000;
186 // The tables opened with this TableCache will be immortal, i.e., their
187 // lifetime is as long as that of the DB.
188 void SetTablesAreImmortal() {
189 if (cache_
->GetCapacity() >= kInfiniteCapacity
) {
190 immortal_tables_
= true;
195 // Build a table reader
// @param file_options File options.
// @param internal_comparator Comparator for internal keys.
// @param fd Descriptor of the file to build a reader for.
// @param sequential_mode NOTE(review): presumably hints at sequential access
//    to the file -- confirm against the definition.
// @param record_read_stats Whether read statistics are recorded.
// @param file_read_hist Optional histogram for file read statistics.
// @param table_reader Output parameter through which the built reader is
//    handed back.
// Defaults: prefix_extractor = nullptr, skip_filters = false, level = -1,
// prefetch_index_and_filter_in_cache = true.
// @return Status of the operation.
196 Status
GetTableReader(const FileOptions
& file_options
,
197 const InternalKeyComparator
& internal_comparator
,
198 const FileDescriptor
& fd
, bool sequential_mode
,
199 bool record_read_stats
, HistogramImpl
* file_read_hist
,
200 std::unique_ptr
<TableReader
>* table_reader
,
201 const SliceTransform
* prefix_extractor
= nullptr,
202 bool skip_filters
= false, int level
= -1,
203 bool prefetch_index_and_filter_in_cache
= true);
205 // Create a key prefix for looking up the row cache. The prefix is of the
206 // format row_cache_id + fd_number + seq_no. Later, the user key can be
207 // appended to form the full key
// @param options Read options for the lookup.
// @param fd Descriptor of the file (source of the fd_number component).
// @param internal_key The internal key being looked up.
// @param get_context Context of the get operation.
// @param row_cache_key Non-const reference; presumably receives the prefix
//    -- confirm against the definition.
208 void CreateRowCacheKeyPrefix(const ReadOptions
& options
,
209 const FileDescriptor
& fd
,
210 const Slice
& internal_key
,
211 GetContext
* get_context
, IterKey
& row_cache_key
);
213 // Helper function to lookup the row cache for a key. It appends the
214 // user key to row_cache_key at offset prefix_size
// @param user_key The user key to append and look up.
// @param row_cache_key Key buffer already holding the prefix; modified here.
// @param prefix_size Offset in row_cache_key at which user_key is appended.
// @param get_context Context of the get operation.
// @return bool; presumably true when the key was found in the row cache --
//    confirm against the definition.
215 bool GetFromRowCache(const Slice
& user_key
, IterKey
& row_cache_key
,
216 size_t prefix_size
, GetContext
* get_context
);
// Immutable column family options, held by const reference (not owned).
218 const ImmutableCFOptions
& ioptions_
;
// File options, held by const reference (not owned).
219 const FileOptions
& file_options_
;
// Identifier string; the row cache key prefix format starts with a
// row_cache_id (see CreateRowCacheKeyPrefix).
221 std::string row_cache_id_
;
// Set to true by SetTablesAreImmortal() when the cache capacity is at least
// kInfiniteCapacity.
222 bool immortal_tables_
;
// Block cache tracer; the pointer itself is const.
223 BlockCacheTracer
* const block_cache_tracer_
;
226 } // namespace ROCKSDB_NAMESPACE