1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
15 #include "cache/cache_entry_roles.h"
16 #include "cache/cache_key.h"
17 #include "cache/cache_reservation_manager.h"
18 #include "db/range_tombstone_fragmenter.h"
19 #include "file/filename.h"
20 #include "rocksdb/slice_transform.h"
21 #include "rocksdb/table_properties.h"
22 #include "table/block_based/block.h"
23 #include "table/block_based/block_based_table_factory.h"
24 #include "table/block_based/block_type.h"
25 #include "table/block_based/cachable_entry.h"
26 #include "table/block_based/filter_block.h"
27 #include "table/block_based/uncompression_dict_reader.h"
28 #include "table/format.h"
29 #include "table/persistent_cache_options.h"
30 #include "table/table_properties_internal.h"
31 #include "table/table_reader.h"
32 #include "table/two_level_iterator.h"
33 #include "trace_replay/block_cache_tracer.h"
34 #include "util/coro_utils.h"
35 #include "util/hash_containers.h"
37 namespace ROCKSDB_NAMESPACE
{
// Forward declarations of types that this header refers to by name.
class FilterBlockReader;
class FullFilterBlockReader;
class InternalKeyComparator;
class FSRandomAccessFile;
struct BlockBasedTableOptions;
// A sequence of (key, value) string pairs; GetKVPairsFromDataBlocks() fills
// a vector of these.
using KVPairBlock = std::vector<std::pair<std::string, std::string>>;
56 // Reader class for BlockBasedTable format.
57 // For the format of BlockBasedTable refer to
58 // https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
59 // This is the default table type. Data is chunked into fixed size blocks and
60 // each block in-turn stores entries. When storing data, we can compress and/or
61 // encode data efficiently within a block, which often results in a much smaller
62 // data size compared with the raw data size. As for the record retrieval, we'll
63 // first locate the block where target record may reside, then read the block to
64 // memory, and finally search that record within the block. Of course, to avoid
65 // frequent reads of the same block, we introduced the block cache to keep the
66 // loaded blocks in the memory.
67 class BlockBasedTable
: public TableReader
{
69 static const std::string kObsoleteFilterBlockPrefix
;
70 static const std::string kFullFilterBlockPrefix
;
71 static const std::string kPartitionedFilterBlockPrefix
;
// Size in bytes of the trailer that follows each block:
// a 1-byte compression type plus a 32-bit (4-byte) checksum.
static constexpr size_t kBlockTrailerSize = 1 + 4;
76 // Attempt to open the table that is stored in bytes [0..file_size)
77 // of "file", and read the metadata entries necessary to allow
78 // retrieving data from the table.
80 // If successful, returns ok and sets "*table_reader" to the newly opened
81 // table. The client should delete "*table_reader" when no longer needed.
82 // If there was an error while initializing the table, sets "*table_reader"
83 // to nullptr and returns a non-ok status.
85 // @param file must remain live while this Table is in use.
86 // @param prefetch_index_and_filter_in_cache can be used to disable
88 // index and filter blocks into block cache at startup
89 // @param skip_filters Disables loading/accessing the filter block. Overrides
90 // prefetch_index_and_filter_in_cache, so filter will be skipped if both
92 // @param force_direct_prefetch if true, always prefetching to RocksDB
93 // buffer, rather than calling RandomAccessFile::Prefetch().
95 const ReadOptions
& ro
, const ImmutableOptions
& ioptions
,
96 const EnvOptions
& env_options
,
97 const BlockBasedTableOptions
& table_options
,
98 const InternalKeyComparator
& internal_key_comparator
,
99 std::unique_ptr
<RandomAccessFileReader
>&& file
, uint64_t file_size
,
100 std::unique_ptr
<TableReader
>* table_reader
,
101 std::shared_ptr
<CacheReservationManager
> table_reader_cache_res_mgr
=
103 const std::shared_ptr
<const SliceTransform
>& prefix_extractor
= nullptr,
104 bool prefetch_index_and_filter_in_cache
= true, bool skip_filters
= false,
105 int level
= -1, const bool immortal_table
= false,
106 const SequenceNumber largest_seqno
= 0,
107 bool force_direct_prefetch
= false,
108 TailPrefetchStats
* tail_prefetch_stats
= nullptr,
109 BlockCacheTracer
* const block_cache_tracer
= nullptr,
110 size_t max_file_size_for_l0_meta_pin
= 0,
111 const std::string
& cur_db_session_id
= "", uint64_t cur_file_num
= 0,
112 UniqueId64x2 expected_unique_id
= {});
114 bool PrefixRangeMayMatch(const Slice
& internal_key
,
115 const ReadOptions
& read_options
,
116 const SliceTransform
* options_prefix_extractor
,
117 const bool need_upper_bound_check
,
118 BlockCacheLookupContext
* lookup_context
) const;
120 // Returns a new iterator over the table contents.
121 // The result of NewIterator() is initially invalid (caller must
122 // call one of the Seek methods on the iterator before using it).
123 // @param read_options Must outlive the returned iterator.
124 // @param skip_filters Disables loading/accessing the filter block
125 // compaction_readahead_size: its value will only be used if caller =
127 InternalIterator
* NewIterator(const ReadOptions
&,
128 const SliceTransform
* prefix_extractor
,
129 Arena
* arena
, bool skip_filters
,
130 TableReaderCaller caller
,
131 size_t compaction_readahead_size
= 0,
132 bool allow_unprepared_value
= false) override
;
134 FragmentedRangeTombstoneIterator
* NewRangeTombstoneIterator(
135 const ReadOptions
& read_options
) override
;
137 // @param skip_filters Disables loading/accessing the filter block
138 Status
Get(const ReadOptions
& readOptions
, const Slice
& key
,
139 GetContext
* get_context
, const SliceTransform
* prefix_extractor
,
140 bool skip_filters
= false) override
;
142 Status
MultiGetFilter(const ReadOptions
& read_options
,
143 const SliceTransform
* prefix_extractor
,
144 MultiGetRange
* mget_range
) override
;
146 DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet
,
147 const ReadOptions
& readOptions
,
148 const MultiGetContext::Range
* mget_range
,
149 const SliceTransform
* prefix_extractor
,
150 bool skip_filters
= false);
152 // Pre-fetch the disk blocks that correspond to the key range specified by
153 // (begin, end). The call will return error status in the event of
154 // IO or iteration error.
155 Status
Prefetch(const Slice
* begin
, const Slice
* end
) override
;
157 // Given a key, return an approximate byte offset in the file where
158 // the data for that key begins (or would begin if the key were
159 // present in the file). The returned value is in terms of file
160 // bytes, and so includes effects like compression of the underlying data.
161 // E.g., the approximate offset of the last key in the table will
162 // be close to the file length.
163 uint64_t ApproximateOffsetOf(const Slice
& key
,
164 TableReaderCaller caller
) override
;
166 // Given start and end keys, return the approximate data size in the file
167 // between the keys. The returned value is in terms of file bytes, and so
168 // includes effects like compression of the underlying data.
169 // The start key must not be greater than the end key.
170 uint64_t ApproximateSize(const Slice
& start
, const Slice
& end
,
171 TableReaderCaller caller
) override
;
173 Status
ApproximateKeyAnchors(const ReadOptions
& read_options
,
174 std::vector
<Anchor
>& anchors
) override
;
176 bool TEST_BlockInCache(const BlockHandle
& handle
) const;
178 // Returns true if the block for the specified key is in cache.
179 // REQUIRES: key is in this table && block cache enabled
180 bool TEST_KeyInCache(const ReadOptions
& options
, const Slice
& key
);
182 // Set up the table for Compaction. Might change some parameters with
184 void SetupForCompaction() override
;
186 std::shared_ptr
<const TableProperties
> GetTableProperties() const override
;
188 size_t ApproximateMemoryUsage() const override
;
190 // convert SST file to a human readable form
191 Status
DumpTable(WritableFile
* out_file
) override
;
193 Status
VerifyChecksum(const ReadOptions
& readOptions
,
194 TableReaderCaller caller
) override
;
198 bool TEST_FilterBlockInCache() const;
199 bool TEST_IndexBlockInCache() const;
201 // IndexReader is the interface that provides the functionality for index
205 virtual ~IndexReader() = default;
207 // Create an iterator for index access. If iter is null, then a new object
208 // is created on the heap, and the callee will have the ownership.
209 // If a non-null iter is passed in, it will be used, and the returned value
210 // is either the same as iter or a new on-heap object that
211 // wraps the passed iter. In the latter case the return value points
212 // to a different object than iter, and the callee has the ownership of the
214 virtual InternalIteratorBase
<IndexValue
>* NewIterator(
215 const ReadOptions
& read_options
, bool disable_prefix_seek
,
216 IndexBlockIter
* iter
, GetContext
* get_context
,
217 BlockCacheLookupContext
* lookup_context
) = 0;
219 // Report an approximation of how much memory has been used other than
220 // memory that was allocated in block cache.
221 virtual size_t ApproximateMemoryUsage() const = 0;
222 // Cache the dependencies of the index reader (e.g. the partitions
223 // of a partitioned index).
224 virtual Status
CacheDependencies(const ReadOptions
& /*ro*/,
230 class IndexReaderCommon
;
232 static void SetupBaseCacheKey(const TableProperties
* properties
,
233 const std::string
& cur_db_session_id
,
234 uint64_t cur_file_number
,
235 OffsetableCacheKey
* out_base_cache_key
,
236 bool* out_is_stable
= nullptr);
238 static CacheKey
GetCacheKey(const OffsetableCacheKey
& base_cache_key
,
239 const BlockHandle
& handle
);
241 static void UpdateCacheInsertionMetrics(BlockType block_type
,
242 GetContext
* get_context
, size_t usage
,
244 Statistics
* const statistics
);
246 // Get the size to read from storage for a BlockHandle. size_t because we
247 // are about to load into memory.
248 static inline size_t BlockSizeWithTrailer(const BlockHandle
& handle
) {
249 return static_cast<size_t>(handle
.size() + kBlockTrailerSize
);
252 // It is the caller's responsibility to make sure that this is called with
253 // block-based table serialized block contents, which contains the compression
254 // byte in the trailer after `block_size`.
255 static inline CompressionType
GetBlockCompressionType(const char* block_data
,
257 return static_cast<CompressionType
>(block_data
[block_size
]);
259 static inline CompressionType
GetBlockCompressionType(
260 const BlockContents
& contents
) {
261 assert(contents
.has_trailer
);
262 return GetBlockCompressionType(contents
.data
.data(), contents
.data
.size());
265 // Retrieve all key value pairs from data blocks in the table.
266 // The key retrieved are internal keys.
267 Status
GetKVPairsFromDataBlocks(std::vector
<KVPairBlock
>* kv_pair_blocks
);
271 Rep
* get_rep() { return rep_
; }
272 const Rep
* get_rep() const { return rep_
; }
274 // input_iter: if it is not null, update this one and return it as Iterator
275 template <typename TBlockIter
>
276 TBlockIter
* NewDataBlockIterator(const ReadOptions
& ro
,
277 const BlockHandle
& block_handle
,
278 TBlockIter
* input_iter
, BlockType block_type
,
279 GetContext
* get_context
,
280 BlockCacheLookupContext
* lookup_context
,
281 FilePrefetchBuffer
* prefetch_buffer
,
282 bool for_compaction
, bool async_read
,
285 // input_iter: if it is not null, update this one and return it as Iterator
286 template <typename TBlockIter
>
287 TBlockIter
* NewDataBlockIterator(const ReadOptions
& ro
,
288 CachableEntry
<Block
>& block
,
289 TBlockIter
* input_iter
, Status s
) const;
291 class PartitionedIndexIteratorState
;
293 template <typename TBlocklike
>
294 friend class FilterBlockReaderCommon
;
296 friend class PartitionIndexReader
;
298 friend class UncompressionDictReader
;
302 explicit BlockBasedTable(Rep
* rep
, BlockCacheTracer
* const block_cache_tracer
)
303 : rep_(rep
), block_cache_tracer_(block_cache_tracer
) {}
304 // No copying allowed
305 explicit BlockBasedTable(const TableReader
&) = delete;
306 void operator=(const TableReader
&) = delete;
309 friend class MockedBlockBasedTable
;
310 friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test
;
311 BlockCacheTracer
* const block_cache_tracer_
;
313 void UpdateCacheHitMetrics(BlockType block_type
, GetContext
* get_context
,
315 void UpdateCacheMissMetrics(BlockType block_type
,
316 GetContext
* get_context
) const;
318 Cache::Handle
* GetEntryFromCache(const CacheTier
& cache_tier
,
319 Cache
* block_cache
, const Slice
& key
,
320 BlockType block_type
, const bool wait
,
321 GetContext
* get_context
,
322 const Cache::CacheItemHelper
* cache_helper
,
323 const Cache::CreateCallback
& create_cb
,
324 Cache::Priority priority
) const;
326 template <typename TBlocklike
>
327 Status
InsertEntryToCache(const CacheTier
& cache_tier
, Cache
* block_cache
,
329 const Cache::CacheItemHelper
* cache_helper
,
330 std::unique_ptr
<TBlocklike
>&& block_holder
,
331 size_t charge
, Cache::Handle
** cache_handle
,
332 Cache::Priority priority
) const;
334 // Either Block::NewDataIterator() or Block::NewIndexIterator().
335 template <typename TBlockIter
>
336 static TBlockIter
* InitBlockIterator(const Rep
* rep
, Block
* block
,
337 BlockType block_type
,
338 TBlockIter
* input_iter
,
339 bool block_contents_pinned
);
341 // If block cache enabled (compressed or uncompressed), looks for the block
342 // identified by handle in (1) uncompressed cache, (2) compressed cache, and
343 // then (3) file. If found, inserts into the cache(s) that were searched
344 // unsuccessfully (e.g., if found in file, will add to both uncompressed and
345 // compressed caches if they're enabled).
347 // @param block_entry value is set to the uncompressed block if found. If
348 // in uncompressed block cache, also sets cache_handle to reference that
350 template <typename TBlocklike
>
351 Status
MaybeReadBlockAndLoadToCache(
352 FilePrefetchBuffer
* prefetch_buffer
, const ReadOptions
& ro
,
353 const BlockHandle
& handle
, const UncompressionDict
& uncompression_dict
,
354 const bool wait
, const bool for_compaction
,
355 CachableEntry
<TBlocklike
>* block_entry
, BlockType block_type
,
356 GetContext
* get_context
, BlockCacheLookupContext
* lookup_context
,
357 BlockContents
* contents
, bool async_read
) const;
359 // Similar to the above, with one crucial difference: it will retrieve the
360 // block from the file even if there are no caches configured (assuming the
361 // read options allow I/O).
362 template <typename TBlocklike
>
363 Status
RetrieveBlock(FilePrefetchBuffer
* prefetch_buffer
,
364 const ReadOptions
& ro
, const BlockHandle
& handle
,
365 const UncompressionDict
& uncompression_dict
,
366 CachableEntry
<TBlocklike
>* block_entry
,
367 BlockType block_type
, GetContext
* get_context
,
368 BlockCacheLookupContext
* lookup_context
,
369 bool for_compaction
, bool use_cache
, bool wait_for_cache
,
370 bool async_read
) const;
372 DECLARE_SYNC_AND_ASYNC_CONST(
373 void, RetrieveMultipleBlocks
, const ReadOptions
& options
,
374 const MultiGetRange
* batch
,
375 const autovector
<BlockHandle
, MultiGetContext::MAX_BATCH_SIZE
>* handles
,
376 autovector
<Status
, MultiGetContext::MAX_BATCH_SIZE
>* statuses
,
377 autovector
<CachableEntry
<Block
>, MultiGetContext::MAX_BATCH_SIZE
>*
379 char* scratch
, const UncompressionDict
& uncompression_dict
);
381 // Get the iterator from the index reader.
383 // If input_iter is not set, return a new Iterator.
384 // If input_iter is set, try to update it and return it as Iterator.
385 // However note that in some cases the returned iterator may be different
386 // from input_iter. In such case the returned iterator should be freed.
388 // Note: ErrorIterator with Status::Incomplete shall be returned if all the
389 // following conditions are met:
390 // 1. We enabled table_options.cache_index_and_filter_blocks.
391 // 2. index is not present in block cache.
392 // 3. We disallowed any io to be performed, that is, read_options ==
394 InternalIteratorBase
<IndexValue
>* NewIndexIterator(
395 const ReadOptions
& read_options
, bool need_upper_bound_check
,
396 IndexBlockIter
* input_iter
, GetContext
* get_context
,
397 BlockCacheLookupContext
* lookup_context
) const;
399 // Read block cache from block caches (if set): block_cache and
400 // block_cache_compressed.
401 // On success, Status::OK will be returned and @block will be populated with
402 // pointer to the block as well as its block handle.
403 // @param uncompression_dict Data for presetting the compression library's
405 template <typename TBlocklike
>
406 Status
GetDataBlockFromCache(const Slice
& cache_key
, Cache
* block_cache
,
407 Cache
* block_cache_compressed
,
408 const ReadOptions
& read_options
,
409 CachableEntry
<TBlocklike
>* block
,
410 const UncompressionDict
& uncompression_dict
,
411 BlockType block_type
, const bool wait
,
412 GetContext
* get_context
) const;
414 // Put a maybe compressed block to the corresponding block caches.
415 // This method will perform decompression against block_contents if needed
416 // and then populate the block caches.
417 // On success, Status::OK will be returned; also @block will be populated with
418 // uncompressed block and its cache handle.
420 // Allocated memory managed by block_contents will be transferred to
421 // PutDataBlockToCache(). After the call, the object will be invalid.
422 // @param uncompression_dict Data for presetting the compression library's
424 template <typename TBlocklike
>
425 Status
PutDataBlockToCache(const Slice
& cache_key
, Cache
* block_cache
,
426 Cache
* block_cache_compressed
,
427 CachableEntry
<TBlocklike
>* cached_block
,
428 BlockContents
&& block_contents
,
429 CompressionType block_comp_type
,
430 const UncompressionDict
& uncompression_dict
,
431 MemoryAllocator
* memory_allocator
,
432 BlockType block_type
,
433 GetContext
* get_context
) const;
435 // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
436 // after a call to Seek(key), until handle_result returns false.
437 // May not make such a call if filter policy says that key is not present.
438 friend class TableCache
;
439 friend class BlockBasedTableBuilder
;
441 // Create an index reader based on the index type stored in the table.
442 // Optionally, user can pass a preloaded meta_index_iter for the index that
443 // needs to access extra meta blocks for index construction. This parameter
444 // helps avoid re-reading meta index block if caller already created one.
445 Status
CreateIndexReader(const ReadOptions
& ro
,
446 FilePrefetchBuffer
* prefetch_buffer
,
447 InternalIterator
* preloaded_meta_index_iter
,
448 bool use_cache
, bool prefetch
, bool pin
,
449 BlockCacheLookupContext
* lookup_context
,
450 std::unique_ptr
<IndexReader
>* index_reader
);
452 bool FullFilterKeyMayMatch(FilterBlockReader
* filter
, const Slice
& user_key
,
454 const SliceTransform
* prefix_extractor
,
455 GetContext
* get_context
,
456 BlockCacheLookupContext
* lookup_context
,
457 Env::IOPriority rate_limiter_priority
) const;
459 void FullFilterKeysMayMatch(FilterBlockReader
* filter
, MultiGetRange
* range
,
461 const SliceTransform
* prefix_extractor
,
462 BlockCacheLookupContext
* lookup_context
,
463 Env::IOPriority rate_limiter_priority
) const;
465 // If force_direct_prefetch is true, always prefetching to RocksDB
466 // buffer, rather than calling RandomAccessFile::Prefetch().
467 static Status
PrefetchTail(
468 const ReadOptions
& ro
, RandomAccessFileReader
* file
, uint64_t file_size
,
469 bool force_direct_prefetch
, TailPrefetchStats
* tail_prefetch_stats
,
470 const bool prefetch_all
, const bool preload_all
,
471 std::unique_ptr
<FilePrefetchBuffer
>* prefetch_buffer
);
472 Status
ReadMetaIndexBlock(const ReadOptions
& ro
,
473 FilePrefetchBuffer
* prefetch_buffer
,
474 std::unique_ptr
<Block
>* metaindex_block
,
475 std::unique_ptr
<InternalIterator
>* iter
);
476 Status
ReadPropertiesBlock(const ReadOptions
& ro
,
477 FilePrefetchBuffer
* prefetch_buffer
,
478 InternalIterator
* meta_iter
,
479 const SequenceNumber largest_seqno
);
480 Status
ReadRangeDelBlock(const ReadOptions
& ro
,
481 FilePrefetchBuffer
* prefetch_buffer
,
482 InternalIterator
* meta_iter
,
483 const InternalKeyComparator
& internal_comparator
,
484 BlockCacheLookupContext
* lookup_context
);
485 Status
PrefetchIndexAndFilterBlocks(
486 const ReadOptions
& ro
, FilePrefetchBuffer
* prefetch_buffer
,
487 InternalIterator
* meta_iter
, BlockBasedTable
* new_table
,
488 bool prefetch_all
, const BlockBasedTableOptions
& table_options
,
489 const int level
, size_t file_size
, size_t max_file_size_for_l0_meta_pin
,
490 BlockCacheLookupContext
* lookup_context
);
492 static BlockType
GetBlockTypeForMetaBlockByName(const Slice
& meta_block_name
);
494 Status
VerifyChecksumInMetaBlocks(InternalIteratorBase
<Slice
>* index_iter
);
495 Status
VerifyChecksumInBlocks(const ReadOptions
& read_options
,
496 InternalIteratorBase
<IndexValue
>* index_iter
);
498 // Create the filter from the filter block.
499 std::unique_ptr
<FilterBlockReader
> CreateFilterBlockReader(
500 const ReadOptions
& ro
, FilePrefetchBuffer
* prefetch_buffer
,
501 bool use_cache
, bool prefetch
, bool pin
,
502 BlockCacheLookupContext
* lookup_context
);
504 // Size of all data blocks, maybe approximate
505 uint64_t GetApproximateDataSize();
507 // Given an iterator return its offset in data block section of file.
508 uint64_t ApproximateDataOffsetOf(
509 const InternalIteratorBase
<IndexValue
>& index_iter
,
510 uint64_t data_size
) const;
512 // Helper functions for DumpTable()
513 Status
DumpIndexBlock(std::ostream
& out_stream
);
514 Status
DumpDataBlocks(std::ostream
& out_stream
);
515 void DumpKeyValue(const Slice
& key
, const Slice
& value
,
516 std::ostream
& out_stream
);
518 // Returns false if prefix_extractor exists and is compatible with that used
519 // in building the table file, otherwise true.
520 bool PrefixExtractorChanged(const SliceTransform
* prefix_extractor
) const;
// Threshold in bytes (8 KiB): a MultiGet whose cumulative data block file
// read is smaller than this will use a stack buffer.
static constexpr size_t kMultiGetReadStackBufSize = 8 * 1024;
526 friend class PartitionedFilterBlockReader
;
527 friend class PartitionedFilterBlockTest
;
528 friend class DBBasicTest_MultiGetIOBufferOverrun_Test
;
531 // Maintaining state of a two-level iteration on a partitioned index structure.
532 class BlockBasedTable::PartitionedIndexIteratorState
533 : public TwoLevelIteratorState
{
535 PartitionedIndexIteratorState(
536 const BlockBasedTable
* table
,
537 UnorderedMap
<uint64_t, CachableEntry
<Block
>>* block_map
);
538 InternalIteratorBase
<IndexValue
>* NewSecondaryIterator(
539 const BlockHandle
& index_value
) override
;
543 const BlockBasedTable
* table_
;
544 UnorderedMap
<uint64_t, CachableEntry
<Block
>>* block_map_
;
547 // Stores all the properties associated with a BlockBasedTable.
548 // These are immutable.
549 struct BlockBasedTable::Rep
{
550 Rep(const ImmutableOptions
& _ioptions
, const EnvOptions
& _env_options
,
551 const BlockBasedTableOptions
& _table_opt
,
552 const InternalKeyComparator
& _internal_comparator
, bool skip_filters
,
553 uint64_t _file_size
, int _level
, const bool _immortal_table
)
554 : ioptions(_ioptions
),
555 env_options(_env_options
),
556 table_options(_table_opt
),
557 filter_policy(skip_filters
? nullptr : _table_opt
.filter_policy
.get()),
558 internal_comparator(_internal_comparator
),
559 filter_type(FilterType::kNoFilter
),
560 index_type(BlockBasedTableOptions::IndexType::kBinarySearch
),
561 whole_key_filtering(_table_opt
.whole_key_filtering
),
562 prefix_filtering(true),
563 global_seqno(kDisableGlobalSequenceNumber
),
564 file_size(_file_size
),
566 immortal_table(_immortal_table
) {}
567 ~Rep() { status
.PermitUncheckedError(); }
568 const ImmutableOptions
& ioptions
;
569 const EnvOptions
& env_options
;
570 const BlockBasedTableOptions table_options
;
571 const FilterPolicy
* const filter_policy
;
572 const InternalKeyComparator
& internal_comparator
;
574 std::unique_ptr
<RandomAccessFileReader
> file
;
575 OffsetableCacheKey base_cache_key
;
576 PersistentCacheOptions persistent_cache_options
;
578 // Footer contains the fixed table information
581 std::unique_ptr
<IndexReader
> index_reader
;
582 std::unique_ptr
<FilterBlockReader
> filter
;
583 std::unique_ptr
<UncompressionDictReader
> uncompression_dict_reader
;
585 enum class FilterType
{
590 FilterType filter_type
;
591 BlockHandle filter_handle
;
592 BlockHandle compression_dict_handle
;
594 std::shared_ptr
<const TableProperties
> table_properties
;
595 BlockBasedTableOptions::IndexType index_type
;
596 bool whole_key_filtering
;
597 bool prefix_filtering
;
598 std::shared_ptr
<const SliceTransform
> table_prefix_extractor
;
600 std::shared_ptr
<FragmentedRangeTombstoneList
> fragmented_range_dels
;
602 // If global_seqno is used, all Keys in this file will have the same
603 // seqno with value `global_seqno`.
605 // A value of kDisableGlobalSequenceNumber means that this feature is disabled
606 // and every key has its own seqno.
607 SequenceNumber global_seqno
;
609 // Size of the table file on disk
612 // the level when the table is opened, could potentially change when trivial
616 // If false, blocks in this file are definitely all uncompressed. Knowing this
617 // before reading individual blocks enables certain optimizations.
618 bool blocks_maybe_compressed
= true;
620 // If true, data blocks in this file are definitely ZSTD compressed. If false
621 // they might not be. When false we skip creating a ZSTD digested
622 // uncompression dictionary. Even if we get a false negative, things should
623 // still work, just not as quickly.
624 bool blocks_definitely_zstd_compressed
= false;
626 // These describe how index is encoded.
627 bool index_has_first_key
= false;
628 bool index_key_includes_seq
= true;
629 bool index_value_is_full
= true;
631 const bool immortal_table
;
633 std::unique_ptr
<CacheReservationManager::CacheReservationHandle
>
634 table_reader_cache_res_handle
= nullptr;
636 SequenceNumber
get_global_seqno(BlockType block_type
) const {
637 return (block_type
== BlockType::kFilterPartitionIndex
||
638 block_type
== BlockType::kCompressionDictionary
)
639 ? kDisableGlobalSequenceNumber
643 uint64_t cf_id_for_tracing() const {
644 return table_properties
645 ? table_properties
->column_family_id
646 : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
647 kUnknownColumnFamily
;
650 Slice
cf_name_for_tracing() const {
651 return table_properties
? table_properties
->column_family_name
652 : BlockCacheTraceHelper::kUnknownColumnFamilyName
;
655 uint32_t level_for_tracing() const { return level
>= 0 ? level
: UINT32_MAX
; }
657 uint64_t sst_number_for_tracing() const {
658 return file
? TableFileNameToNumber(file
->file_name()) : UINT64_MAX
;
660 void CreateFilePrefetchBuffer(
661 size_t readahead_size
, size_t max_readahead_size
,
662 std::unique_ptr
<FilePrefetchBuffer
>* fpb
, bool implicit_auto_readahead
,
663 uint64_t num_file_reads
,
664 uint64_t num_file_reads_for_auto_readahead
) const {
665 fpb
->reset(new FilePrefetchBuffer(
666 readahead_size
, max_readahead_size
,
667 !ioptions
.allow_mmap_reads
/* enable */, false /* track_min_offset */,
668 implicit_auto_readahead
, num_file_reads
,
669 num_file_reads_for_auto_readahead
, ioptions
.fs
.get(), ioptions
.clock
,
673 void CreateFilePrefetchBufferIfNotExists(
674 size_t readahead_size
, size_t max_readahead_size
,
675 std::unique_ptr
<FilePrefetchBuffer
>* fpb
, bool implicit_auto_readahead
,
676 uint64_t num_file_reads
,
677 uint64_t num_file_reads_for_auto_readahead
) const {
679 CreateFilePrefetchBuffer(readahead_size
, max_readahead_size
, fpb
,
680 implicit_auto_readahead
, num_file_reads
,
681 num_file_reads_for_auto_readahead
);
685 std::size_t ApproximateMemoryUsage() const {
686 std::size_t usage
= 0;
687 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
688 usage
+= malloc_usable_size(const_cast<BlockBasedTable::Rep
*>(this));
690 usage
+= sizeof(*this);
691 #endif // ROCKSDB_MALLOC_USABLE_SIZE
696 // This is an adapter class for `WritableFile` to be used for `std::ostream`.
697 // The adapter wraps a `WritableFile`, which can be passed to a `std::ostream`
698 // constructor for storing streaming data.
700 // * This adapter doesn't provide any buffering, each write is forwarded to
701 // `WritableFile->Append()` directly.
702 // * For a failed write, the user needs to check the status by `ostream.good()`
703 class WritableFileStringStreamAdapter
: public std::stringbuf
{
705 explicit WritableFileStringStreamAdapter(WritableFile
* writable_file
)
706 : file_(writable_file
) {}
708 // Override overflow() to handle `sputc()`. There are cases that will not go
709 // through `xsputn()` e.g. `std::endl` or an unsigned long long is written by
710 // `os.put()` directly and will call `sputc()`. By internal implementation:
711 // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character
712 // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) :
713 // overflow(_Traits::to_int_type(_Ch));
715 // As we explicitly disabled buffering (_Pnavail() is always 0), every write,
716 // not captured by xsputn(), becomes an overflow here.
717 int overflow(int ch
= EOF
) override
{
719 Status s
= file_
->Append(Slice((char*)&ch
, 1));
727 std::streamsize
xsputn(char const* p
, std::streamsize n
) override
{
728 Status s
= file_
->Append(Slice(p
, n
));
739 } // namespace ROCKSDB_NAMESPACE