]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based_table_reader.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / block_based_table_reader.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <stdint.h>
13#include <memory>
14#include <set>
15#include <string>
16#include <utility>
17#include <vector>
18
494da23a 19#include "db/range_tombstone_fragmenter.h"
7c673cae
FG
20#include "options/cf_options.h"
21#include "rocksdb/options.h"
22#include "rocksdb/persistent_cache.h"
23#include "rocksdb/statistics.h"
24#include "rocksdb/status.h"
25#include "rocksdb/table.h"
11fdf7f2
TL
26#include "table/block.h"
27#include "table/block_based_table_factory.h"
7c673cae
FG
28#include "table/filter_block.h"
29#include "table/format.h"
30#include "table/persistent_cache_helper.h"
31#include "table/table_properties_internal.h"
32#include "table/table_reader.h"
33#include "table/two_level_iterator.h"
34#include "util/coding.h"
35#include "util/file_reader_writer.h"
494da23a 36#include "util/user_comparator_wrapper.h"
7c673cae
FG
37
namespace rocksdb {

// Forward declarations: these types are used only by pointer/reference in
// this header, so the full definitions are not needed here.
class BlockHandle;
class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class RandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

// All key/value pairs of one data block, as raw (internal-key, value) strings.
typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
58
59// A Table is a sorted map from strings to strings. Tables are
60// immutable and persistent. A Table may be safely accessed from
61// multiple threads without external synchronization.
62class BlockBasedTable : public TableReader {
63 public:
64 static const std::string kFilterBlockPrefix;
65 static const std::string kFullFilterBlockPrefix;
66 static const std::string kPartitionedFilterBlockPrefix;
67 // The longest prefix of the cache key used to identify blocks.
68 // For Posix files the unique ID is three varints.
69 static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
70
71 // Attempt to open the table that is stored in bytes [0..file_size)
72 // of "file", and read the metadata entries necessary to allow
73 // retrieving data from the table.
74 //
75 // If successful, returns ok and sets "*table_reader" to the newly opened
76 // table. The client should delete "*table_reader" when no longer needed.
77 // If there was an error while initializing the table, sets "*table_reader"
78 // to nullptr and returns a non-ok status.
79 //
80 // @param file must remain live while this Table is in use.
81 // @param prefetch_index_and_filter_in_cache can be used to disable
82 // prefetching of
83 // index and filter blocks into block cache at startup
84 // @param skip_filters Disables loading/accessing the filter block. Overrides
85 // prefetch_index_and_filter_in_cache, so filter will be skipped if both
86 // are set.
87 static Status Open(const ImmutableCFOptions& ioptions,
88 const EnvOptions& env_options,
89 const BlockBasedTableOptions& table_options,
90 const InternalKeyComparator& internal_key_comparator,
494da23a
TL
91 std::unique_ptr<RandomAccessFileReader>&& file,
92 uint64_t file_size,
93 std::unique_ptr<TableReader>* table_reader,
11fdf7f2 94 const SliceTransform* prefix_extractor = nullptr,
7c673cae 95 bool prefetch_index_and_filter_in_cache = true,
11fdf7f2
TL
96 bool skip_filters = false, int level = -1,
97 const bool immortal_table = false,
98 const SequenceNumber largest_seqno = 0,
99 TailPrefetchStats* tail_prefetch_stats = nullptr);
7c673cae 100
11fdf7f2
TL
101 bool PrefixMayMatch(const Slice& internal_key,
102 const ReadOptions& read_options,
103 const SliceTransform* options_prefix_extractor,
104 const bool need_upper_bound_check);
7c673cae
FG
105
106 // Returns a new iterator over the table contents.
107 // The result of NewIterator() is initially invalid (caller must
108 // call one of the Seek methods on the iterator before using it).
109 // @param skip_filters Disables loading/accessing the filter block
11fdf7f2
TL
110 InternalIterator* NewIterator(const ReadOptions&,
111 const SliceTransform* prefix_extractor,
112 Arena* arena = nullptr,
113 bool skip_filters = false,
114 bool for_compaction = false) override;
7c673cae 115
494da23a 116 FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
7c673cae
FG
117 const ReadOptions& read_options) override;
118
119 // @param skip_filters Disables loading/accessing the filter block
120 Status Get(const ReadOptions& readOptions, const Slice& key,
11fdf7f2
TL
121 GetContext* get_context, const SliceTransform* prefix_extractor,
122 bool skip_filters = false) override;
7c673cae
FG
123
124 // Pre-fetch the disk blocks that correspond to the key range specified by
125 // (kbegin, kend). The call will return error status in the event of
126 // IO or iteration error.
127 Status Prefetch(const Slice* begin, const Slice* end) override;
128
129 // Given a key, return an approximate byte offset in the file where
130 // the data for that key begins (or would begin if the key were
131 // present in the file). The returned value is in terms of file
132 // bytes, and so includes effects like compression of the underlying data.
133 // E.g., the approximate offset of the last key in the table will
134 // be close to the file length.
135 uint64_t ApproximateOffsetOf(const Slice& key) override;
136
137 // Returns true if the block for the specified key is in cache.
138 // REQUIRES: key is in this table && block cache enabled
139 bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
140
141 // Set up the table for Compaction. Might change some parameters with
142 // posix_fadvise
143 void SetupForCompaction() override;
144
145 std::shared_ptr<const TableProperties> GetTableProperties() const override;
146
147 size_t ApproximateMemoryUsage() const override;
148
149 // convert SST file to a human readable form
11fdf7f2
TL
150 Status DumpTable(WritableFile* out_file,
151 const SliceTransform* prefix_extractor = nullptr) override;
152
153 Status VerifyChecksum() override;
7c673cae
FG
154
155 void Close() override;
156
157 ~BlockBasedTable();
158
159 bool TEST_filter_block_preloaded() const;
160 bool TEST_index_reader_preloaded() const;
161
162 // IndexReader is the interface that provide the functionality for index
163 // access.
164 class IndexReader {
165 public:
11fdf7f2
TL
166 explicit IndexReader(const InternalKeyComparator* icomparator,
167 Statistics* stats)
168 : icomparator_(icomparator), statistics_(stats) {}
7c673cae
FG
169
170 virtual ~IndexReader() {}
171
172 // Create an iterator for index access.
173 // If iter is null then a new object is created on heap and the callee will
174 // have the ownership. If a non-null iter is passed in it will be used, and
175 // the returned value is either the same as iter or a new on-heap object
176 // that
177 // wrapps the passed iter. In the latter case the return value would point
178 // to
179 // a different object then iter and the callee has the ownership of the
180 // returned object.
11fdf7f2
TL
181 virtual InternalIteratorBase<BlockHandle>* NewIterator(
182 IndexBlockIter* iter = nullptr, bool total_order_seek = true,
183 bool fill_cache = true) = 0;
7c673cae
FG
184
185 // The size of the index.
186 virtual size_t size() const = 0;
187 // Memory usage of the index block
188 virtual size_t usable_size() const = 0;
189 // return the statistics pointer
190 virtual Statistics* statistics() const { return statistics_; }
191 // Report an approximation of how much memory has been used other than
192 // memory
193 // that was allocated in block cache.
194 virtual size_t ApproximateMemoryUsage() const = 0;
195
11fdf7f2
TL
196 virtual void CacheDependencies(bool /* unused */) {}
197
198 // Prefetch all the blocks referenced by this index to the buffer
199 void PrefetchBlocks(FilePrefetchBuffer* buf);
200
7c673cae 201 protected:
11fdf7f2 202 const InternalKeyComparator* icomparator_;
7c673cae
FG
203
204 private:
205 Statistics* statistics_;
206 };
207
208 static Slice GetCacheKey(const char* cache_key_prefix,
209 size_t cache_key_prefix_size,
210 const BlockHandle& handle, char* cache_key);
211
212 // Retrieve all key value pairs from data blocks in the table.
213 // The key retrieved are internal keys.
214 Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
215
11fdf7f2
TL
216 template <class TValue>
217 struct CachableEntry;
218 struct Rep;
219
220 Rep* get_rep() { return rep_; }
221
222 // input_iter: if it is not null, update this one and return it as Iterator
223 template <typename TBlockIter>
224 static TBlockIter* NewDataBlockIterator(
225 Rep* rep, const ReadOptions& ro, const Slice& index_value,
226 TBlockIter* input_iter = nullptr, bool is_index = false,
227 bool key_includes_seq = true, bool index_key_is_full = true,
228 GetContext* get_context = nullptr,
229 FilePrefetchBuffer* prefetch_buffer = nullptr);
230 template <typename TBlockIter>
231 static TBlockIter* NewDataBlockIterator(
232 Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde,
233 TBlockIter* input_iter = nullptr, bool is_index = false,
234 bool key_includes_seq = true, bool index_key_is_full = true,
235 GetContext* get_context = nullptr, Status s = Status(),
236 FilePrefetchBuffer* prefetch_buffer = nullptr);
237
238 class PartitionedIndexIteratorState;
7c673cae
FG
239
240 friend class PartitionIndexReader;
241
242 protected:
7c673cae 243 Rep* rep_;
11fdf7f2 244 explicit BlockBasedTable(Rep* rep) : rep_(rep) {}
7c673cae
FG
245
246 private:
11fdf7f2
TL
247 friend class MockedBlockBasedTable;
248 static std::atomic<uint64_t> next_cache_key_id_;
7c673cae 249
7c673cae
FG
250 // If block cache enabled (compressed or uncompressed), looks for the block
251 // identified by handle in (1) uncompressed cache, (2) compressed cache, and
252 // then (3) file. If found, inserts into the cache(s) that were searched
253 // unsuccessfully (e.g., if found in file, will add to both uncompressed and
254 // compressed caches if they're enabled).
255 //
256 // @param block_entry value is set to the uncompressed block if found. If
257 // in uncompressed block cache, also sets cache_handle to reference that
258 // block.
494da23a
TL
259 static Status MaybeReadBlockAndLoadToCache(
260 FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
261 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
262 CachableEntry<Block>* block_entry, bool is_index = false,
263 GetContext* get_context = nullptr);
7c673cae
FG
264
265 // For the following two functions:
266 // if `no_io == true`, we will not try to read filter/index from sst file
267 // were they not present in cache yet.
11fdf7f2
TL
268 CachableEntry<FilterBlockReader> GetFilter(
269 const SliceTransform* prefix_extractor = nullptr,
270 FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false,
271 GetContext* get_context = nullptr) const;
7c673cae 272 virtual CachableEntry<FilterBlockReader> GetFilter(
11fdf7f2
TL
273 FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
274 const bool is_a_filter_partition, bool no_io, GetContext* get_context,
275 const SliceTransform* prefix_extractor = nullptr) const;
7c673cae 276
494da23a
TL
277 static CachableEntry<UncompressionDict> GetUncompressionDict(
278 Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io,
279 GetContext* get_context);
280
7c673cae
FG
281 // Get the iterator from the index reader.
282 // If input_iter is not set, return new Iterator
283 // If input_iter is set, update it and return it as Iterator
284 //
285 // Note: ErrorIterator with Status::Incomplete shall be returned if all the
286 // following conditions are met:
287 // 1. We enabled table_options.cache_index_and_filter_blocks.
288 // 2. index is not present in block cache.
289 // 3. We disallowed any io to be performed, that is, read_options ==
290 // kBlockCacheTier
11fdf7f2
TL
291 InternalIteratorBase<BlockHandle>* NewIndexIterator(
292 const ReadOptions& read_options, bool need_upper_bound_check = false,
293 IndexBlockIter* input_iter = nullptr,
294 CachableEntry<IndexReader>* index_entry = nullptr,
295 GetContext* get_context = nullptr);
7c673cae
FG
296
297 // Read block cache from block caches (if set): block_cache and
298 // block_cache_compressed.
299 // On success, Status::OK with be returned and @block will be populated with
300 // pointer to the block as well as its block handle.
494da23a 301 // @param uncompression_dict Data for presetting the compression library's
7c673cae
FG
302 // dictionary.
303 static Status GetDataBlockFromCache(
304 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
494da23a
TL
305 Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
306 const ReadOptions& read_options,
307 BlockBasedTable::CachableEntry<Block>* block,
308 const UncompressionDict& uncompression_dict,
309 size_t read_amp_bytes_per_bit, bool is_index = false,
310 GetContext* get_context = nullptr);
7c673cae
FG
311
312 // Put a raw block (maybe compressed) to the corresponding block caches.
313 // This method will perform decompression against raw_block if needed and then
314 // populate the block caches.
315 // On success, Status::OK will be returned; also @block will be populated with
316 // uncompressed block and its cache handle.
317 //
494da23a
TL
318 // Allocated memory managed by raw_block_contents will be transferred to
319 // PutDataBlockToCache(). After the call, the object will be invalid.
320 // @param uncompression_dict Data for presetting the compression library's
7c673cae
FG
321 // dictionary.
322 static Status PutDataBlockToCache(
323 const Slice& block_cache_key, const Slice& compressed_block_cache_key,
324 Cache* block_cache, Cache* block_cache_compressed,
325 const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
494da23a
TL
326 CachableEntry<Block>* block, BlockContents* raw_block_contents,
327 CompressionType raw_block_comp_type, uint32_t format_version,
328 const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
329 size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
11fdf7f2
TL
330 bool is_index = false, Cache::Priority pri = Cache::Priority::LOW,
331 GetContext* get_context = nullptr);
7c673cae
FG
332
333 // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
334 // after a call to Seek(key), until handle_result returns false.
335 // May not make such a call if filter policy says that key is not present.
336 friend class TableCache;
337 friend class BlockBasedTableBuilder;
338
339 void ReadMeta(const Footer& footer);
340
11fdf7f2
TL
341 // Figure the index type, update it in rep_, and also return it.
342 BlockBasedTableOptions::IndexType UpdateIndexType();
343
7c673cae
FG
344 // Create a index reader based on the index type stored in the table.
345 // Optionally, user can pass a preloaded meta_index_iter for the index that
346 // need to access extra meta blocks for index construction. This parameter
347 // helps avoid re-reading meta index block if caller already created one.
348 Status CreateIndexReader(
11fdf7f2 349 FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
7c673cae
FG
350 InternalIterator* preloaded_meta_index_iter = nullptr,
351 const int level = -1);
352
11fdf7f2
TL
353 bool FullFilterKeyMayMatch(
354 const ReadOptions& read_options, FilterBlockReader* filter,
355 const Slice& user_key, const bool no_io,
356 const SliceTransform* prefix_extractor = nullptr) const;
7c673cae 357
494da23a
TL
358 static Status PrefetchTail(
359 RandomAccessFileReader* file, uint64_t file_size,
360 TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
361 const bool preload_all,
362 std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
11fdf7f2
TL
363 static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer,
364 std::unique_ptr<Block>* meta_block,
7c673cae 365 std::unique_ptr<InternalIterator>* iter);
494da23a
TL
366 static Status TryReadPropertiesWithGlobalSeqno(
367 Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
368 TableProperties** table_properties);
369 static Status ReadPropertiesBlock(Rep* rep,
370 FilePrefetchBuffer* prefetch_buffer,
371 InternalIterator* meta_iter,
372 const SequenceNumber largest_seqno);
373 static Status ReadRangeDelBlock(
374 Rep* rep, FilePrefetchBuffer* prefetch_buffer,
375 InternalIterator* meta_iter,
376 const InternalKeyComparator& internal_comparator);
377 static Status ReadCompressionDictBlock(
378 Rep* rep, FilePrefetchBuffer* prefetch_buffer,
379 std::unique_ptr<const BlockContents>* compression_dict_block);
380 static Status PrefetchIndexAndFilterBlocks(
381 Rep* rep, FilePrefetchBuffer* prefetch_buffer,
382 InternalIterator* meta_iter, BlockBasedTable* new_table,
383 const SliceTransform* prefix_extractor, bool prefetch_all,
384 const BlockBasedTableOptions& table_options, const int level,
385 const bool prefetch_index_and_filter_in_cache);
386
387 Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
11fdf7f2
TL
388 Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
389
7c673cae 390 // Create the filter from the filter block.
11fdf7f2
TL
391 virtual FilterBlockReader* ReadFilter(
392 FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
393 const bool is_a_filter_partition,
394 const SliceTransform* prefix_extractor = nullptr) const;
7c673cae
FG
395
396 static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
397
398 // Generate a cache key prefix from the file
494da23a
TL
399 static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
400 char* buffer, size_t* size);
401 static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer,
402 size_t* size);
7c673cae
FG
403
404 // Helper functions for DumpTable()
405 Status DumpIndexBlock(WritableFile* out_file);
406 Status DumpDataBlocks(WritableFile* out_file);
407 void DumpKeyValue(const Slice& key, const Slice& value,
408 WritableFile* out_file);
409
410 // No copying allowed
411 explicit BlockBasedTable(const TableReader&) = delete;
412 void operator=(const TableReader&) = delete;
413
414 friend class PartitionedFilterBlockReader;
415 friend class PartitionedFilterBlockTest;
416};
417
// Maintaining state of a two-level iteration on a partitioned index structure.
// Acts as the TwoLevelIteratorState: given an index entry's BlockHandle it
// produces the secondary (partition) iterator, preferring pinned blocks found
// in `block_map_`.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
      const bool index_key_includes_seq, const bool index_key_is_full);
  InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Don't own table_
  BlockBasedTable* table_;
  // Maps block offsets to already-pinned index partition blocks; not owned.
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
  // Whether index keys include the 8-byte sequence number suffix.
  bool index_key_includes_seq_;
  bool index_key_is_full_;
};
436
437// CachableEntry represents the entries that *may* be fetched from block cache.
438// field `value` is the item we want to get.
439// field `cache_handle` is the cache handle to the block cache. If the value
440// was not read from cache, `cache_handle` will be nullptr.
441template <class TValue>
442struct BlockBasedTable::CachableEntry {
443 CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
444 : value(_value), cache_handle(_cache_handle) {}
445 CachableEntry() : CachableEntry(nullptr, nullptr) {}
11fdf7f2 446 void Release(Cache* cache, bool force_erase = false) {
7c673cae 447 if (cache_handle) {
11fdf7f2 448 cache->Release(cache_handle, force_erase);
7c673cae
FG
449 value = nullptr;
450 cache_handle = nullptr;
451 }
452 }
453 bool IsSet() const { return cache_handle != nullptr; }
454
455 TValue* value = nullptr;
456 // if the entry is from the cache, cache_handle will be populated.
457 Cache::Handle* cache_handle = nullptr;
458};
459
// Rep holds all per-table state shared by BlockBasedTable and its iterators:
// options, file handle, cache key prefixes, footer, and the (possibly pinned)
// index/filter/compression-dictionary readers.
struct BlockBasedTable::Rep {
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        level(_level),
        immortal_table(_immortal_table) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  // Cache key prefixes identifying this file in the block cache, the
  // persistent cache, and the compressed block cache respectively.
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
  // `index_reader`, `filter`, and `uncompression_dict` will be populated
  // (i.e., non-nullptr) and used only when options.block_cache is nullptr or
  // when `cache_index_and_filter_blocks == false`. Otherwise, we will get the
  // index, filter, and compression dictionary blocks via the block cache. In
  // that case `dummy_index_reader_offset`, `filter_handle`, and
  // `compression_dict_handle` are used to lookup these meta-blocks in block
  // cache.
  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDict> uncompression_dict;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache
  // is true or in all levels when pin_top_level_index_and_filter is set in
  // combination with partitioned index/filters: then we do use the LRU cache,
  // but we always keep the filter & index block's handle checked out here
  // (= we don't call Release()).  Since the parsed-out objects' handles stay
  // checked out, the LRU cache will never flush them, hence they're pinned.
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all Keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // the level when the table is opened, could potentially change when trivial
  // move is involved
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If
  // false they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  bool closed = false;
  const bool immortal_table;

  // Index blocks are never stamped with a global seqno; data blocks are.
  SequenceNumber get_global_seqno(bool is_index) const {
    return is_index ? kDisableGlobalSequenceNumber : global_seqno;
  }
};
568
// Two-level iterator over the table's contents: `index_iter_` selects a block
// (by BlockHandle), and `block_iter_` (a TBlockIter) iterates within the
// currently loaded block.  TValue is the value type exposed to callers
// (Slice for data iteration; BlockHandle when iterating an index).
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
 public:
  BlockBasedTableIterator(BlockBasedTable* table,
                          const ReadOptions& read_options,
                          const InternalKeyComparator& icomp,
                          InternalIteratorBase<BlockHandle>* index_iter,
                          bool check_filter, bool need_upper_bound_check,
                          const SliceTransform* prefix_extractor, bool is_index,
                          bool key_includes_seq = true,
                          bool index_key_is_full = true,
                          bool for_compaction = false)
      : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
        index_iter_(index_iter),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
        is_index_(is_index),
        key_includes_seq_(key_includes_seq),
        index_key_is_full_(index_key_is_full),
        for_compaction_(for_compaction) {}

  // Owns index_iter_ (heap-allocated by the caller) and deletes it here.
  ~BlockBasedTableIterator() { delete index_iter_; }

  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Next() override;
  void Prev() override;
  // Valid only when positioned inside a real block and within any iterate
  // upper bound.
  bool Valid() const override {
    return !is_out_of_bound_ && block_iter_points_to_real_block_ &&
           block_iter_.Valid();
  }
  Slice key() const override {
    assert(Valid());
    return block_iter_.key();
  }
  TValue value() const override {
    assert(Valid());
    return block_iter_.value();
  }
  // Index-iterator errors take precedence; otherwise report the data block
  // iterator's status (OK when no block is loaded).
  Status status() const override {
    if (!index_iter_->status().ok()) {
      return index_iter_->status();
    } else if (block_iter_points_to_real_block_) {
      return block_iter_.status();
    } else {
      return Status::OK();
    }
  }

  bool IsOutOfBound() override { return is_out_of_bound_; }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  bool IsKeyPinned() const override {
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_ && block_iter_.IsKeyPinned();
  }
  bool IsValuePinned() const override {
    // BlockIter::IsValuePinned() is always true. No need to check
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_;
  }

  // Returns false (and invalidates the data iterator) if the prefix filter
  // rules out `ikey`; true otherwise.
  bool CheckPrefixMayMatch(const Slice& ikey) {
    if (check_filter_ &&
        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
                                need_upper_bound_check_)) {
      // TODO remember the iterator is invalidated because of prefix
      // match. This can avoid the upper level file iterator to falsely
      // believe the position is the end of the SST file and move to
      // the first key of the next file.
      ResetDataIter();
      return false;
    }
    return true;
  }

  // Invalidate the data-block iterator, handing its cleanups to the pinned
  // iterators manager when pinning is enabled (so pinned keys stay alive).
  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
      }
      block_iter_.Invalidate(Status::OK());
      block_iter_points_to_real_block_ = false;
    }
  }

  void SavePrevIndexValue() {
    if (block_iter_points_to_real_block_) {
      // Reseek. If they end up with the same data block, we shouldn't re-fetch
      // the same data block.
      prev_index_value_ = index_iter_->value();
    }
  }

  void InitDataBlock();
  void FindKeyForward();
  void FindKeyBackward();

 private:
  BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
  UserComparatorWrapper user_comparator_;
  InternalIteratorBase<BlockHandle>* index_iter_;
  PinnedIteratorsManager* pinned_iters_mgr_;
  TBlockIter block_iter_;
  // True when block_iter_ is positioned on an actual (loaded) block.
  bool block_iter_points_to_real_block_;
  bool is_out_of_bound_ = false;
  bool check_filter_;
  // TODO(Zhongyi): pick a better name
  bool need_upper_bound_check_;
  const SliceTransform* prefix_extractor_;
  // If the blocks over which we iterate are index blocks
  bool is_index_;
  // If the keys in the blocks over which we iterate include 8 byte sequence
  bool key_includes_seq_;
  bool index_key_is_full_;
  // If this iterator is created for compaction
  bool for_compaction_;
  // Handle of the block the iterator was on before a reseek; used to avoid
  // re-fetching the same data block (see SavePrevIndexValue()).
  BlockHandle prev_index_value_;

  static const size_t kInitReadaheadSize = 8 * 1024;
  // Found that 256 KB readahead size provides the best performance, based on
  // experiments.
  static const size_t kMaxReadaheadSize;
  // Readahead window grows from kInitReadaheadSize as sequential file reads
  // accumulate (num_file_reads_), up to readahead_limit_.
  size_t readahead_size_ = kInitReadaheadSize;
  size_t readahead_limit_ = 0;
  int num_file_reads_ = 0;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
};

}  // namespace rocksdb