// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <cstdint>
#include <memory>

#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/cache_reservation_manager.h"
#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table_properties.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/format.h"
#include "table/persistent_cache_options.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coro_utils.h"
#include "util/hash_containers.h"

namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

using KVPairBlock = std::vector<std::pair<std::string, std::string>>;

// Reader class for the BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block in turn stores entries. When storing data, we can compress
// and/or encode the data efficiently within a block, which often results in a
// much smaller data size compared with the raw data size. As for record
// retrieval, we first locate the block where the target record may reside,
// then read the block into memory, and finally search for the record within
// the block. Of course, to avoid frequent reads of the same block, we
// introduced the block cache to keep loaded blocks in memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kObsoleteFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;

  // 1-byte compression type + 32-bit checksum
  static constexpr size_t kBlockTrailerSize = 5;
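  // (Accordingly, a serialized block on disk is laid out as:
  //  | payload: handle.size() bytes | compression type: 1 byte | checksum: 4 bytes |)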

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table. The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup.
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so the filter will be skipped if
  //    both are set.
  // @param force_direct_prefetch if true, always prefetch into RocksDB's own
  //    buffer rather than calling RandomAccessFile::Prefetch().
  static Status Open(
      const ReadOptions& ro, const ImmutableOptions& ioptions,
      const EnvOptions& env_options,
      const BlockBasedTableOptions& table_options,
      const InternalKeyComparator& internal_key_comparator,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* table_reader,
      std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
          nullptr,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
      int level = -1, const bool immortal_table = false,
      const SequenceNumber largest_seqno = 0,
      bool force_direct_prefetch = false,
      TailPrefetchStats* tail_prefetch_stats = nullptr,
      BlockCacheTracer* const block_cache_tracer = nullptr,
      size_t max_file_size_for_l0_meta_pin = 0,
      const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0,
      UniqueId64x2 expected_unique_id = {});
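
  // A minimal usage sketch (hedged; `ioptions`, `env_options`,
  // `table_options`, `ikc`, `file_reader`, and `file_size` are hypothetical
  // locals prepared by the caller):
  //
  //   std::unique_ptr<TableReader> reader;
  //   Status s = BlockBasedTable::Open(ReadOptions(), ioptions, env_options,
  //                                    table_options, ikc,
  //                                    std::move(file_reader), file_size,
  //                                    &reader);
  //   if (!s.ok()) {
  //     // "reader" is nullptr on failure
  //   }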

  bool PrefixRangeMayMatch(const Slice& internal_key,
                           const ReadOptions& read_options,
                           const SliceTransform* options_prefix_extractor,
                           const bool need_upper_bound_check,
                           BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param read_options Must outlive the returned iterator.
  // @param skip_filters Disables loading/accessing the filter block.
  // @param compaction_readahead_size Used only if caller == kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0,
                                bool allow_unprepared_value = false) override;
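
  // A minimal iteration sketch (hedged; `table` is a hypothetical
  // BlockBasedTable* and `ro` a ReadOptions; with a null arena the iterator
  // is heap-allocated and owned by the caller):
  //
  //   std::unique_ptr<InternalIterator> iter(table->NewIterator(
  //       ro, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
  //       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key; iter->value() is the value slice
  //   }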

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block.
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  Status MultiGetFilter(const ReadOptions& read_options,
                        const SliceTransform* prefix_extractor,
                        MultiGetRange* mget_range) override;

  DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet,
                                  const ReadOptions& readOptions,
                                  const MultiGetContext::Range* mget_range,
                                  const SliceTransform* prefix_extractor,
                                  bool skip_filters = false);

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (begin, end). The call will return an error status in the event of an
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;

  Status ApproximateKeyAnchors(const ReadOptions& read_options,
                               std::vector<Anchor>& anchors) override;

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for compaction. Might change some parameters with
  // posix_fadvise.
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // Convert the SST file to a human-readable form.
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, a new object
    // is created on the heap, and the caller has ownership of it.
    // If a non-null iter is passed in, it will be used, and the returned
    // value is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the caller owns the
    // returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual Status CacheDependencies(const ReadOptions& /*ro*/,
                                     bool /* pin */) {
      return Status::OK();
    }
  };

  class IndexReaderCommon;

  static void SetupBaseCacheKey(const TableProperties* properties,
                                const std::string& cur_db_session_id,
                                uint64_t cur_file_number,
                                OffsetableCacheKey* out_base_cache_key,
                                bool* out_is_stable = nullptr);

  static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key,
                              const BlockHandle& handle);

  static void UpdateCacheInsertionMetrics(BlockType block_type,
                                          GetContext* get_context,
                                          size_t usage, bool redundant,
                                          Statistics* const statistics);

  // Get the size to read from storage for a BlockHandle. size_t because we
  // are about to load into memory.
  static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) {
    return static_cast<size_t>(handle.size() + kBlockTrailerSize);
  }

  // It is the caller's responsibility to make sure that this is called with
  // block-based table serialized block contents, which contain the
  // compression byte in the trailer after `block_size`.
  static inline CompressionType GetBlockCompressionType(const char* block_data,
                                                        size_t block_size) {
    return static_cast<CompressionType>(block_data[block_size]);
  }
  static inline CompressionType GetBlockCompressionType(
      const BlockContents& contents) {
    assert(contents.has_trailer);
    return GetBlockCompressionType(contents.data.data(), contents.data.size());
  }
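
  // For example (a hedged sketch): given a serialized block just read from
  // the file, where `data` points at the payload and `payload_size` excludes
  // the 5-byte trailer, the compression byte can be inspected like this:
  //
  //   CompressionType ctype =
  //       BlockBasedTable::GetBlockCompressionType(data, payload_size);
  //   if (ctype == kNoCompression) {
  //     // The payload can be used as-is; no decompression step is needed.
  //   }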

  // Retrieve all key-value pairs from the data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   const BlockHandle& block_handle,
                                   TBlockIter* input_iter, BlockType block_type,
                                   GetContext* get_context,
                                   BlockCacheLookupContext* lookup_context,
                                   FilePrefetchBuffer* prefetch_buffer,
                                   bool for_compaction, bool async_read,
                                   Status& s) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep,
                           BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
  BlockCacheTracer* const block_cache_tracer_;

  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;

  Cache::Handle* GetEntryFromCache(const CacheTier& cache_tier,
                                   Cache* block_cache, const Slice& key,
                                   BlockType block_type, const bool wait,
                                   GetContext* get_context,
                                   const Cache::CacheItemHelper* cache_helper,
                                   const Cache::CreateCallback& create_cb,
                                   Cache::Priority priority) const;

  template <typename TBlocklike>
  Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache,
                            const Slice& key,
                            const Cache::CacheItemHelper* cache_helper,
                            std::unique_ptr<TBlocklike>&& block_holder,
                            size_t charge, Cache::Handle** cache_handle,
                            Cache::Priority priority) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       BlockType block_type,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If the block cache is enabled (compressed or uncompressed), looks for the
  // block identified by handle in (1) uncompressed cache, (2) compressed
  // cache, and then (3) file. If found, inserts into the cache(s) that were
  // searched unsuccessfully (e.g., if found in file, will add to both
  // uncompressed and compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    found in the uncompressed block cache, also sets cache_handle to
  //    reference that block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      const bool wait, const bool for_compaction,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents, bool async_read) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache,
                       bool wait_for_cache, bool async_read) const;

  DECLARE_SYNC_AND_ASYNC_CONST(
      void, RetrieveMultipleBlocks, const ReadOptions& options,
      const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict);

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However, note that in some cases the returned iterator may be different
  // from input_iter. In such a case the returned iterator should be freed.
  //
  // Note: an ErrorIterator with Status::Incomplete shall be returned if all
  // of the following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. The index is not present in the block cache.
  //  3. We disallowed any I/O, that is, read_options.read_tier ==
  //     kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Look up a block in the block caches, if set: block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache,
                               Cache* block_cache_compressed,
                               const ReadOptions& read_options,
                               CachableEntry<TBlocklike>* block,
                               const UncompressionDict& uncompression_dict,
                               BlockType block_type, const bool wait,
                               GetContext* get_context) const;

  // Put a possibly compressed block into the corresponding block caches.
  // This method will perform decompression against block_contents if needed
  // and then populate the block caches.
  // On success, Status::OK will be returned; also @cached_block will be
  // populated with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache,
                             Cache* block_cache_compressed,
                             CachableEntry<TBlocklike>* cached_block,
                             BlockContents&& block_contents,
                             CompressionType block_comp_type,
                             const UncompressionDict& uncompression_dict,
                             MemoryAllocator* memory_allocator,
                             BlockType block_type,
                             GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if the filter policy says that the key is not
  // present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, a caller can pass a preloaded meta_index_iter for an index
  // that needs to access extra meta blocks for construction. This parameter
  // helps avoid re-reading the meta index block if the caller already created
  // one.
  Status CreateIndexReader(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             Env::IOPriority rate_limiter_priority) const;

  void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context,
                              Env::IOPriority rate_limiter_priority) const;

  // If force_direct_prefetch is true, always prefetch into RocksDB's own
  // buffer rather than calling RandomAccessFile::Prefetch().
  static Status PrefetchTail(
      const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
      bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
      const bool prefetch_all, const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(const ReadOptions& ro,
                            FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status ReadPropertiesBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
      bool prefetch_all, const BlockBasedTableOptions& table_options,
      const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      bool use_cache, bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context);

  // Size of all data blocks, possibly approximate.
  uint64_t GetApproximateDataSize();

  // Given an index iterator, return its offset in the data block section of
  // the file.
  uint64_t ApproximateDataOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter,
      uint64_t data_size) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(std::ostream& out_stream);
  Status DumpDataBlocks(std::ostream& out_stream);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    std::ostream& out_stream);

  // Returns false if prefix_extractor exists and is compatible with the one
  // used in building the table file, otherwise true.
  bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const;

  // A cumulative data block file read in MultiGet smaller than this size will
  // use a stack buffer.
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintains the state of a two-level iteration on a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      UnorderedMap<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Doesn't own table_
  const BlockBasedTable* table_;
  UnorderedMap<uint64_t, CachableEntry<Block>>* block_map_;
};

// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      uint64_t _file_size, int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        file_size(_file_size),
        level(_level),
        immortal_table(_immortal_table) {}
  ~Rep() { status.PermitUncheckedError(); }
  const ImmutableOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  OffsetableCacheKey base_cache_key;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool whole_key_filtering;
  bool prefix_filtering;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all keys in this file will have the same
  // seqno, with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // Size of the table file on disk
  uint64_t file_size;

  // The level at which the table is opened; could potentially change when a
  // trivial move is involved.
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If
  // false they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  std::unique_ptr<CacheReservationManager::CacheReservationHandle>
      table_reader_cache_res_handle = nullptr;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilterPartitionIndex ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const {
    return level >= 0 ? level : UINT32_MAX;
  }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead,
      uint64_t num_file_reads,
      uint64_t num_file_reads_for_auto_readahead) const {
    fpb->reset(new FilePrefetchBuffer(
        readahead_size, max_readahead_size,
        !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
        implicit_auto_readahead, num_file_reads,
        num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock,
        ioptions.stats));
  }

  void CreateFilePrefetchBufferIfNotExists(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead,
      uint64_t num_file_reads,
      uint64_t num_file_reads_for_auto_readahead) const {
    if (!(*fpb)) {
      CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb,
                               implicit_auto_readahead, num_file_reads,
                               num_file_reads_for_auto_readahead);
    }
  }

  std::size_t ApproximateMemoryUsage() const {
    std::size_t usage = 0;
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BlockBasedTable::Rep*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }
};

// This is an adapter class for `WritableFile` so that it can be used with
// `std::ostream`. The adapter wraps a `WritableFile`, which can be passed to
// a `std::ostream` constructor for storing streaming data.
// Note:
//  * This adapter doesn't provide any buffering: each write is forwarded to
//    `WritableFile->Append()` directly.
//  * For a failed write, the user needs to check the status via
//    `ostream.good()`.
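//
// A usage sketch (hedged; `out_file` stands for a caller-provided
// WritableFile*, e.g. the one handed to DumpTable()):
//
//   WritableFileStringStreamAdapter adapter(out_file);
//   std::ostream out_stream(&adapter);
//   out_stream << "Hello" << std::endl;
//   if (!out_stream.good()) {
//     // an Append() failed somewhere along the way
//   }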
class WritableFileStringStreamAdapter : public std::stringbuf {
 public:
  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
      : file_(writable_file) {}

  // Override overflow() to handle `sputc()`. There are cases that do not go
  // through `xsputn()`, e.g. when `std::endl` or an unsigned long long is
  // written via `os.put()` directly, which calls `sputc()`. By internal
  // implementation:
  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
  //        return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) :
  //        overflow(_Traits::to_int_type(_Ch));
  //    }
  // As we explicitly disabled buffering (_Pnavail() is always 0), every write
  // not captured by xsputn() becomes an overflow here.
  int overflow(int ch = EOF) override {
    if (ch != EOF) {
      Status s = file_->Append(Slice((char*)&ch, 1));
      if (s.ok()) {
        return ch;
      }
    }
    return EOF;
  }

  std::streamsize xsputn(char const* p, std::streamsize n) override {
    Status s = file_->Append(Slice(p, n));
    if (!s.ok()) {
      return 0;
    }
    return n;
  }

 private:
  WritableFile* file_;
};

}  // namespace ROCKSDB_NAMESPACE