]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
12 | #include <stdint.h> | |
13 | #include <memory> | |
14 | #include <set> | |
15 | #include <string> | |
16 | #include <utility> | |
17 | #include <vector> | |
18 | ||
494da23a | 19 | #include "db/range_tombstone_fragmenter.h" |
7c673cae FG |
20 | #include "options/cf_options.h" |
21 | #include "rocksdb/options.h" | |
22 | #include "rocksdb/persistent_cache.h" | |
23 | #include "rocksdb/statistics.h" | |
24 | #include "rocksdb/status.h" | |
25 | #include "rocksdb/table.h" | |
11fdf7f2 TL |
26 | #include "table/block.h" |
27 | #include "table/block_based_table_factory.h" | |
7c673cae FG |
28 | #include "table/filter_block.h" |
29 | #include "table/format.h" | |
30 | #include "table/persistent_cache_helper.h" | |
31 | #include "table/table_properties_internal.h" | |
32 | #include "table/table_reader.h" | |
33 | #include "table/two_level_iterator.h" | |
34 | #include "util/coding.h" | |
35 | #include "util/file_reader_writer.h" | |
494da23a | 36 | #include "util/user_comparator_wrapper.h" |
7c673cae FG |
37 | |
namespace rocksdb {

// Forward declarations: this header only needs the names; the full
// definitions live in their respective headers, keeping compile-time
// dependencies of this (widely included) header small.
class BlockHandle;
class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class RandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

// A sequence of (key, value) string pairs, used to return the contents of
// one data block (keys are internal keys; see GetKVPairsFromDataBlocks).
typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
58 | ||
// A Table is a sorted map from strings to strings.  Tables are
// immutable and persistent.  A Table may be safely accessed from
// multiple threads without external synchronization.
class BlockBasedTable : public TableReader {
 public:
  // Meta-block name prefixes used to locate the filter block for each
  // filter flavor (block-based, full, partitioned).
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;
  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table.  The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so filter will be skipped if both
  //    are set.
  // @param level the LSM level this table is opened at (-1 if unknown).
  // @param immortal_table if true, the file is never deleted while open.
  // @param largest_seqno used when reconciling a global sequence number
  //    stored in the properties block.
  static Status Open(const ImmutableCFOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     TailPrefetchStats* tail_prefetch_stats = nullptr);

  // Returns true unless the (prefix) bloom filter proves that no key with
  // internal_key's prefix exists in the table.  A true return may be a
  // false positive.
  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check);

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param arena if non-null, the iterator is allocated in the arena and
  //    must not be deleted directly by the caller.
  // @param skip_filters Disables loading/accessing the filter block
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena = nullptr,
                                bool skip_filters = false,
                                bool for_compaction = false) override;

  // Returns an iterator over this table's fragmented range tombstones, or
  // nullptr if the table has none.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // Point lookup: feeds every candidate entry for `key` into *get_context.
  // @param skip_filters Disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (kbegin, kend). The call will return error status in the event of
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file).  The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key) override;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // convert SST file to a human readable form
  Status DumpTable(WritableFile* out_file,
                   const SliceTransform* prefix_extractor = nullptr) override;

  // Verifies checksums of all data and metadata blocks in the file.
  Status VerifyChecksum() override;

  void Close() override;

  ~BlockBasedTable();

  bool TEST_filter_block_preloaded() const;
  bool TEST_index_reader_preloaded() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    explicit IndexReader(const InternalKeyComparator* icomparator,
                         Statistics* stats)
        : icomparator_(icomparator), statistics_(stats) {}

    virtual ~IndexReader() {}

    // Create an iterator for index access.
    // If iter is null then a new object is created on heap and the callee
    // will have the ownership. If a non-null iter is passed in it will be
    // used, and the returned value is either the same as iter or a new
    // on-heap object that wraps the passed iter. In the latter case the
    // return value would point to a different object than iter and the
    // callee has the ownership of the returned object.
    virtual InternalIteratorBase<BlockHandle>* NewIterator(
        IndexBlockIter* iter = nullptr, bool total_order_seek = true,
        bool fill_cache = true) = 0;

    // The size of the index.
    virtual size_t size() const = 0;
    // Memory usage of the index block
    virtual size_t usable_size() const = 0;
    // return the statistics pointer
    virtual Statistics* statistics() const { return statistics_; }
    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;

    // Hook for multi-level (partitioned) index readers to pin/cache their
    // sub-blocks; the default implementation does nothing.
    virtual void CacheDependencies(bool /* unused */) {}

    // Prefetch all the blocks referenced by this index to the buffer
    void PrefetchBlocks(FilePrefetchBuffer* buf);

   protected:
    const InternalKeyComparator* icomparator_;

   private:
    Statistics* statistics_;
  };

  // Builds the fixed-size cache key for `handle` into `cache_key` (which
  // must have room for cache_key_prefix_size + kMaxVarint64Length bytes)
  // and returns it as a Slice.
  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);

  // Retrieve all key value pairs from data blocks in the table.
  // The key retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  template <class TValue>
  struct CachableEntry;
  struct Rep;

  Rep* get_rep() { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  static TBlockIter* NewDataBlockIterator(
      Rep* rep, const ReadOptions& ro, const Slice& index_value,
      TBlockIter* input_iter = nullptr, bool is_index = false,
      bool key_includes_seq = true, bool index_key_is_full = true,
      GetContext* get_context = nullptr,
      FilePrefetchBuffer* prefetch_buffer = nullptr);
  // Overload taking an already-decoded block handle instead of the raw
  // index entry.  (NOTE: "block_hanlde" is a historical typo kept to match
  // the out-of-line definition.)
  template <typename TBlockIter>
  static TBlockIter* NewDataBlockIterator(
      Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde,
      TBlockIter* input_iter = nullptr, bool is_index = false,
      bool key_includes_seq = true, bool index_key_is_full = true,
      GetContext* get_context = nullptr, Status s = Status(),
      FilePrefetchBuffer* prefetch_buffer = nullptr);

  class PartitionedIndexIteratorState;

  friend class PartitionIndexReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep) : rep_(rep) {}

 private:
  friend class MockedBlockBasedTable;
  // Monotonic counter used to build unique cache key prefixes when the file
  // itself cannot supply a unique ID.
  static std::atomic<uint64_t> next_cache_key_id_;

  // If block cache enabled (compressed or uncompressed), looks for the block
  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
  // then (3) file. If found, inserts into the cache(s) that were searched
  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
  // compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    in uncompressed block cache, also sets cache_handle to reference that
  //    block.
  static Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<Block>* block_entry, bool is_index = false,
      GetContext* get_context = nullptr);

  // For the following two functions:
  // if `no_io == true`, we will not try to read filter/index from sst file
  // were they not present in cache yet.
  CachableEntry<FilterBlockReader> GetFilter(
      const SliceTransform* prefix_extractor = nullptr,
      FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false,
      GetContext* get_context = nullptr) const;
  virtual CachableEntry<FilterBlockReader> GetFilter(
      FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
      const bool is_a_filter_partition, bool no_io, GetContext* get_context,
      const SliceTransform* prefix_extractor = nullptr) const;

  // Fetches the (possibly cached) compression dictionary used to decompress
  // data blocks of this file.
  static CachableEntry<UncompressionDict> GetUncompressionDict(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io,
      GetContext* get_context);

  // Get the iterator from the index reader.
  // If input_iter is not set, return new Iterator
  // If input_iter is set, update it and return it as Iterator
  //
  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
  // following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. index is not present in block cache.
  //  3. We disallowed any io to be performed, that is, read_options ==
  //     kBlockCacheTier
  InternalIteratorBase<BlockHandle>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check = false,
      IndexBlockIter* input_iter = nullptr,
      CachableEntry<IndexReader>* index_entry = nullptr,
      GetContext* get_context = nullptr);

  // Read block cache from block caches (if set): block_cache and
  // block_cache_compressed.
  // On success, Status::OK with be returned and @block will be populated with
  // pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  static Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
      const ReadOptions& read_options,
      BlockBasedTable::CachableEntry<Block>* block,
      const UncompressionDict& uncompression_dict,
      size_t read_amp_bytes_per_bit, bool is_index = false,
      GetContext* get_context = nullptr);

  // Put a raw block (maybe compressed) to the corresponding block caches.
  // This method will perform decompression against raw_block if needed and
  // then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  static Status PutDataBlockToCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
      CachableEntry<Block>* block, BlockContents* raw_block_contents,
      CompressionType raw_block_comp_type, uint32_t format_version,
      const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
      size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
      bool is_index = false, Cache::Priority pri = Cache::Priority::LOW,
      GetContext* get_context = nullptr);

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if filter policy says that key is not present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  void ReadMeta(const Footer& footer);

  // Figure the index type, update it in rep_, and also return it.
  BlockBasedTableOptions::IndexType UpdateIndexType();

  // Create a index reader based on the index type stored in the table.
  // Optionally, user can pass a preloaded meta_index_iter for the index that
  // need to access extra meta blocks for index construction. This parameter
  // helps avoid re-reading meta index block if caller already created one.
  Status CreateIndexReader(
      FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
      InternalIterator* preloaded_meta_index_iter = nullptr,
      const int level = -1);

  // Returns false only when the full-key bloom filter proves `user_key` is
  // absent; true otherwise (including when the filter cannot be consulted).
  bool FullFilterKeyMayMatch(
      const ReadOptions& read_options, FilterBlockReader* filter,
      const Slice& user_key, const bool no_io,
      const SliceTransform* prefix_extractor = nullptr) const;

  // Reads the tail of the file into *prefetch_buffer so footer/meta blocks
  // can be parsed without extra IOs; sizing is guided by tail_prefetch_stats.
  static Status PrefetchTail(
      RandomAccessFileReader* file, uint64_t file_size,
      TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
      const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer,
                              std::unique_ptr<Block>* meta_block,
                              std::unique_ptr<InternalIterator>* iter);
  static Status TryReadPropertiesWithGlobalSeqno(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
      TableProperties** table_properties);
  static Status ReadPropertiesBlock(Rep* rep,
                                    FilePrefetchBuffer* prefetch_buffer,
                                    InternalIterator* meta_iter,
                                    const SequenceNumber largest_seqno);
  static Status ReadRangeDelBlock(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter,
      const InternalKeyComparator& internal_comparator);
  static Status ReadCompressionDictBlock(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer,
      std::unique_ptr<const BlockContents>* compression_dict_block);
  static Status PrefetchIndexAndFilterBlocks(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
      const SliceTransform* prefix_extractor, bool prefetch_all,
      const BlockBasedTableOptions& table_options, const int level,
      const bool prefetch_index_and_filter_in_cache);

  // Checksum verification helpers: one walks the meta-index entries, the
  // other walks the data blocks referenced by the given index iterator.
  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);

  // Create the filter from the filter block.
  virtual FilterBlockReader* ReadFilter(
      FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
      const bool is_a_filter_partition,
      const SliceTransform* prefix_extractor = nullptr) const;

  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);

  // Generate a cache key prefix from the file
  static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
                                  char* buffer, size_t* size);
  static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer,
                                  size_t* size);

  // Helper functions for DumpTable()
  Status DumpIndexBlock(WritableFile* out_file);
  Status DumpDataBlocks(WritableFile* out_file);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    WritableFile* out_file);

  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
};
417 | ||
// Maintaining state of a two-level iteration on a partitioned index structure.
// Acts as the TwoLevelIteratorState that materializes second-level (partition)
// iterators from first-level index entries, reusing blocks pinned in
// block_map when available.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  // @param block_map maps a partition block's offset to its pinned cache
  //    entry; not owned.  The two bool flags describe the index key format
  //    (whether keys carry an 8-byte sequence suffix / are full keys).
  PartitionedIndexIteratorState(
      BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
      const bool index_key_includes_seq, const bool index_key_is_full);
  InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Don't own table_
  BlockBasedTable* table_;
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
  bool index_key_includes_seq_;
  bool index_key_is_full_;
};
436 | ||
437 | // CachableEntry represents the entries that *may* be fetched from block cache. | |
438 | // field `value` is the item we want to get. | |
439 | // field `cache_handle` is the cache handle to the block cache. If the value | |
440 | // was not read from cache, `cache_handle` will be nullptr. | |
441 | template <class TValue> | |
442 | struct BlockBasedTable::CachableEntry { | |
443 | CachableEntry(TValue* _value, Cache::Handle* _cache_handle) | |
444 | : value(_value), cache_handle(_cache_handle) {} | |
445 | CachableEntry() : CachableEntry(nullptr, nullptr) {} | |
11fdf7f2 | 446 | void Release(Cache* cache, bool force_erase = false) { |
7c673cae | 447 | if (cache_handle) { |
11fdf7f2 | 448 | cache->Release(cache_handle, force_erase); |
7c673cae FG |
449 | value = nullptr; |
450 | cache_handle = nullptr; | |
451 | } | |
452 | } | |
453 | bool IsSet() const { return cache_handle != nullptr; } | |
454 | ||
455 | TValue* value = nullptr; | |
456 | // if the entry is from the cache, cache_handle will be populated. | |
457 | Cache::Handle* cache_handle = nullptr; | |
458 | }; | |
459 | ||
// Rep holds all per-open-file state of a BlockBasedTable.  NOTE: member
// declaration order determines constructor initialization order; keep the
// init list below in sync with it.
struct BlockBasedTable::Rep {
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        level(_level),
        immortal_table(_immortal_table) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  // Copied (not referenced): table_options must outlive cached blocks.
  const BlockBasedTableOptions table_options;
  // nullptr when filters are skipped or not configured.
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  // Cache key prefixes for the regular, persistent, and compressed caches;
  // each prefix plus a varint-encoded block offset forms a cache key.
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
  // `index_reader`, `filter`, and `uncompression_dict` will be populated
  // (i.e., non-nullptr) and used only when options.block_cache is nullptr or
  // when `cache_index_and_filter_blocks == false`. Otherwise, we will get
  // the index, filter, and compression dictionary blocks via the block
  // cache. In that case `dummy_index_reader_offset`, `filter_handle`, and
  // `compression_dict_handle` are used to lookup these meta-blocks in block
  // cache.
  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDict> uncompression_dict;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache
  // is true or in all levels when pin_top_level_index_and_filter is set in
  // combination with partitioned index/filters: then we do use the LRU
  // cache, but we always keep the filter & index block's handle checked out
  // here (=we don't call Release()), plus the parsed out objects the LRU
  // cache will never push flush them out, hence they're pinned
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all Keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key have it's own seqno.
  SequenceNumber global_seqno;

  // the level when the table is opened, could potentially change when
  // trivial move is involved
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If
  // false they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  bool closed = false;
  const bool immortal_table;

  // Index blocks always ignore the file-level global seqno; data blocks
  // use it when set.
  SequenceNumber get_global_seqno(bool is_index) const {
    return is_index ? kDisableGlobalSequenceNumber : global_seqno;
  }
};
568 | ||
// Iterator over one BlockBasedTable: a two-level iterator that walks the
// index (index_iter_) to locate data blocks and a per-block iterator
// (block_iter_) within the current block.  TBlockIter selects the block
// iterator type (data vs. index blocks); TValue is the value type exposed
// to callers.
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
 public:
  // Takes ownership of index_iter (deleted in the destructor).
  BlockBasedTableIterator(BlockBasedTable* table,
                          const ReadOptions& read_options,
                          const InternalKeyComparator& icomp,
                          InternalIteratorBase<BlockHandle>* index_iter,
                          bool check_filter, bool need_upper_bound_check,
                          const SliceTransform* prefix_extractor, bool is_index,
                          bool key_includes_seq = true,
                          bool index_key_is_full = true,
                          bool for_compaction = false)
      : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
        index_iter_(index_iter),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
        is_index_(is_index),
        key_includes_seq_(key_includes_seq),
        index_key_is_full_(index_key_is_full),
        for_compaction_(for_compaction) {}

  ~BlockBasedTableIterator() { delete index_iter_; }

  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Next() override;
  void Prev() override;
  // Valid only while positioned inside a real data block and not past the
  // iterate_upper_bound.
  bool Valid() const override {
    return !is_out_of_bound_ && block_iter_points_to_real_block_ &&
           block_iter_.Valid();
  }
  Slice key() const override {
    assert(Valid());
    return block_iter_.key();
  }
  TValue value() const override {
    assert(Valid());
    return block_iter_.value();
  }
  // Index errors take precedence; block status only matters while a real
  // block is loaded.
  Status status() const override {
    if (!index_iter_->status().ok()) {
      return index_iter_->status();
    } else if (block_iter_points_to_real_block_) {
      return block_iter_.status();
    } else {
      return Status::OK();
    }
  }

  bool IsOutOfBound() override { return is_out_of_bound_; }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  bool IsKeyPinned() const override {
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_ && block_iter_.IsKeyPinned();
  }
  bool IsValuePinned() const override {
    // BlockIter::IsValuePinned() is always true. No need to check
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_;
  }

  // Consults the prefix bloom filter for ikey; on a definite miss the data
  // iterator is reset and false is returned.
  bool CheckPrefixMayMatch(const Slice& ikey) {
    if (check_filter_ &&
        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
                                need_upper_bound_check_)) {
      // TODO remember the iterator is invalidated because of prefix
      // match. This can avoid the upper level file iterator to falsely
      // believe the position is the end of the SST file and move to
      // the first key of the next file.
      ResetDataIter();
      return false;
    }
    return true;
  }

  // Invalidates the current block iterator; cleanups are handed off to the
  // pinning manager when pinning is enabled so pinned keys stay valid.
  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
      }
      block_iter_.Invalidate(Status::OK());
      block_iter_points_to_real_block_ = false;
    }
  }

  void SavePrevIndexValue() {
    if (block_iter_points_to_real_block_) {
      // Reseek. If they end up with the same data block, we shouldn't
      // re-fetch the same data block.
      prev_index_value_ = index_iter_->value();
    }
  }

  void InitDataBlock();
  void FindKeyForward();
  void FindKeyBackward();

 private:
  BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
  UserComparatorWrapper user_comparator_;
  InternalIteratorBase<BlockHandle>* index_iter_;  // owned
  PinnedIteratorsManager* pinned_iters_mgr_;
  TBlockIter block_iter_;
  bool block_iter_points_to_real_block_;
  // Set when iteration crosses read_options_.iterate_upper_bound.
  bool is_out_of_bound_ = false;
  bool check_filter_;
  // TODO(Zhongyi): pick a better name
  bool need_upper_bound_check_;
  const SliceTransform* prefix_extractor_;
  // If the blocks over which we iterate are index blocks
  bool is_index_;
  // If the keys in the blocks over which we iterate include 8 byte sequence
  bool key_includes_seq_;
  bool index_key_is_full_;
  // If this iterator is created for compaction
  bool for_compaction_;
  // Last index entry seen, used to avoid re-reading the same data block on
  // a reseek (see SavePrevIndexValue()).
  BlockHandle prev_index_value_;

  // Readahead doubles from kInitReadaheadSize up to kMaxReadaheadSize as
  // sequential file reads accumulate.
  static const size_t kInitReadaheadSize = 8 * 1024;
  // Found that 256 KB readahead size provides the best performance, based
  // on experiments.
  static const size_t kMaxReadaheadSize;
  size_t readahead_size_ = kInitReadaheadSize;
  size_t readahead_limit_ = 0;
  int num_file_reads_ = 0;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
};
709 | ||
710 | } // namespace rocksdb |