]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based_table_reader.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / table / block_based_table_reader.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9#include "table/block_based_table_reader.h"
10
11#include <algorithm>
11fdf7f2 12#include <array>
7c673cae
FG
13#include <limits>
14#include <string>
15#include <utility>
16#include <vector>
17
18#include "db/dbformat.h"
19#include "db/pinned_iterators_manager.h"
20
21#include "rocksdb/cache.h"
22#include "rocksdb/comparator.h"
23#include "rocksdb/env.h"
24#include "rocksdb/filter_policy.h"
25#include "rocksdb/iterator.h"
26#include "rocksdb/options.h"
27#include "rocksdb/statistics.h"
28#include "rocksdb/table.h"
29#include "rocksdb/table_properties.h"
30
31#include "table/block.h"
32#include "table/block_based_filter_block.h"
33#include "table/block_based_table_factory.h"
11fdf7f2 34#include "table/block_fetcher.h"
7c673cae
FG
35#include "table/block_prefix_index.h"
36#include "table/filter_block.h"
37#include "table/format.h"
38#include "table/full_filter_block.h"
39#include "table/get_context.h"
40#include "table/internal_iterator.h"
41#include "table/meta_blocks.h"
42#include "table/partitioned_filter_block.h"
43#include "table/persistent_cache_helper.h"
44#include "table/sst_file_writer_collectors.h"
45#include "table/two_level_iterator.h"
46
47#include "monitoring/perf_context_imp.h"
48#include "util/coding.h"
49#include "util/file_reader_writer.h"
50#include "util/stop_watch.h"
51#include "util/string_util.h"
52#include "util/sync_point.h"
53
54namespace rocksdb {
55
56extern const uint64_t kBlockBasedTableMagicNumber;
57extern const std::string kHashIndexPrefixesBlock;
58extern const std::string kHashIndexPrefixesMetadataBlock;
59using std::unique_ptr;
60
61typedef BlockBasedTable::IndexReader IndexReader;
62
// Releases the table's resources: Close() unpins/releases any cached
// index and filter entries, then the heap-allocated Rep is freed.
BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
}
67
11fdf7f2
TL
// Monotonic id source shared by all tables; used when building unique
// cache-key prefixes (see SetupCacheKeyPrefix / GenerateCachePrefix).
std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);
69
7c673cae
FG
70namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param compression_dict Data for presetting the compression library's
//    dictionary.
// @param global_seqno sequence number applied to every key in the block
//    (kDisableGlobalSequenceNumber for ordinary, non-external files).
// @param immortal_file when true the file outlives the block, so the block
//    may reference the file's data without copying.
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, const Slice& compression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, const bool immortal_file = false) {
  BlockContents contents;
  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
                             &contents, ioptions, do_uncompress,
                             compression_dict, cache_options, immortal_file);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    // Wrap the raw contents into a Block; *result is left untouched on error.
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}
96
97// Delete the resource that is held by the iterator.
// Iterator cleanup callback: destroys the ResourceType instance passed as
// the first opaque argument. The second argument is unused.
template <class ResourceType>
void DeleteHeldResource(void* arg, void* /*ignored*/) {
  auto* resource = static_cast<ResourceType*>(arg);
  delete resource;
}
102
103// Delete the entry resided in the cache.
104template <class Entry>
11fdf7f2 105void DeleteCachedEntry(const Slice& /*key*/, void* value) {
7c673cae
FG
106 auto entry = reinterpret_cast<Entry*>(value);
107 delete entry;
108}
109
110void DeleteCachedFilterEntry(const Slice& key, void* value);
111void DeleteCachedIndexEntry(const Slice& key, void* value);
112
113// Release the cached entry and decrement its ref count.
114void ReleaseCachedEntry(void* arg, void* h) {
115 Cache* cache = reinterpret_cast<Cache*>(arg);
116 Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
117 cache->Release(handle);
118}
119
11fdf7f2
TL
120// Release the cached entry and decrement its ref count.
121void ForceReleaseCachedEntry(void* arg, void* h) {
122 Cache* cache = reinterpret_cast<Cache*>(arg);
123 Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
124 cache->Release(handle, true /* force_erase */);
125}
126
7c673cae
FG
// Builds a cache key by appending a varint-encoded `offset` to the table's
// cache-key prefix. `cache_key` is the caller-provided output buffer (must
// hold at least kMaxCacheKeyPrefixSize + a varint64); the returned Slice
// points into that buffer.
Slice GetCacheKeyFromOffset(const char* cache_key_prefix,
                            size_t cache_key_prefix_size, uint64_t offset,
                            char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset);
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}
137
// Looks up `key` in `block_cache` and records hit/miss accounting.
// When a GetContext is supplied, per-type and overall counters are
// accumulated into it (for batched stat updates at the end of the Get);
// otherwise tickers are recorded directly on `statistics`.
// Returns the cache handle (caller must release it) or nullptr on miss.
Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                 Tickers block_cache_miss_ticker,
                                 Tickers block_cache_hit_ticker,
                                 uint64_t* block_cache_miss_stats,
                                 uint64_t* block_cache_hit_stats,
                                 Statistics* statistics,
                                 GetContext* get_context) {
  auto cache_handle = block_cache->Lookup(key, statistics);
  if (cache_handle != nullptr) {
    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    if (get_context != nullptr) {
      // overall cache hit
      get_context->get_context_stats_.num_cache_hit++;
      // total bytes read from cache
      get_context->get_context_stats_.num_cache_bytes_read +=
          block_cache->GetUsage(cache_handle);
      // block-type specific cache hit
      (*block_cache_hit_stats)++;
    } else {
      // overall cache hit
      RecordTick(statistics, BLOCK_CACHE_HIT);
      // total bytes read from cache
      RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
                 block_cache->GetUsage(cache_handle));
      RecordTick(statistics, block_cache_hit_ticker);
    }
  } else {
    if (get_context != nullptr) {
      // overall cache miss
      get_context->get_context_stats_.num_cache_miss++;
      // block-type specific cache miss
      (*block_cache_miss_stats)++;
    } else {
      RecordTick(statistics, BLOCK_CACHE_MISS);
      RecordTick(statistics, block_cache_miss_ticker);
    }
  }

  return cache_handle;
}
178
11fdf7f2
TL
179// For hash based index, return true if prefix_extractor and
180// prefix_extractor_block mismatch, false otherwise. This flag will be used
181// as total_order_seek via NewIndexIterator
182bool PrefixExtractorChanged(const TableProperties* table_properties,
183 const SliceTransform* prefix_extractor) {
184 // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
185 // Turn off hash index in prefix_extractor is not set; if prefix_extractor
186 // is set but prefix_extractor_block is not set, also disable hash index
187 if (prefix_extractor == nullptr || table_properties == nullptr ||
188 table_properties->prefix_extractor_name.empty()) {
189 return true;
190 }
191
192 // prefix_extractor and prefix_extractor_block are both non-empty
193 if (table_properties->prefix_extractor_name.compare(
194 prefix_extractor->Name()) != 0) {
195 return true;
196 } else {
197 return false;
198 }
199}
200
7c673cae
FG
201} // namespace
202
// Index that allows binary search lookup in a two-level index structure.
// The top-level index block points at partition blocks; each partition is a
// regular index block covering a contiguous key range. Inherits Cleanable so
// that pinned partition cache handles can be released on destruction.
class PartitionIndexReader : public IndexReader, public Cleanable {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(BlockBasedTable* table, RandomAccessFileReader* file,
                       FilePrefetchBuffer* prefetch_buffer,
                       const Footer& footer, const BlockHandle& index_handle,
                       const ImmutableCFOptions& ioptions,
                       const InternalKeyComparator* icomparator,
                       IndexReader** index_reader,
                       const PersistentCacheOptions& cache_options,
                       const int level, const bool index_key_includes_seq,
                       const bool index_value_is_full) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, prefetch_buffer, footer, ReadOptions(), index_handle,
        &index_block, ioptions, true /* decompress */,
        Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (s.ok()) {
      *index_reader = new PartitionIndexReader(
          table, icomparator, std::move(index_block), ioptions.statistics,
          level, index_key_includes_seq, index_value_is_full);
    }

    return s;
  }

  // return a two-level iterator: first level is on the partition index
  virtual InternalIteratorBase<BlockHandle>* NewIterator(
      IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true,
      bool fill_cache = true) override {
    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // Partitions were pinned by CacheDependencies(pin=true): serve the
      // second level straight from the pinned map, no cache lookups needed.
      return NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(
              table_, &partition_map_, index_key_includes_seq_,
              index_value_is_full_),
          index_block_->NewIterator<IndexBlockIter>(
              icomparator_, icomparator_->user_comparator(), nullptr,
              kNullStats, true, index_key_includes_seq_, index_value_is_full_));
    } else {
      auto ro = ReadOptions();
      ro.fill_cache = fill_cache;
      bool kIsIndex = true;
      return new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
          table_, ro, *icomparator_,
          index_block_->NewIterator<IndexBlockIter>(
              icomparator_, icomparator_->user_comparator(), nullptr,
              kNullStats, true, index_key_includes_seq_, index_value_is_full_),
          false, true, /* prefix_extractor */ nullptr, kIsIndex,
          index_key_includes_seq_, index_value_is_full_);
    }
    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currentlly it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  // Prefetch all index partitions with one contiguous read and load them
  // into the block cache; when `pin` is true, keep the cache handles alive
  // (via partition_map_ + Cleanable) for the lifetime of this reader.
  virtual void CacheDependencies(bool pin) override {
    // Before read partitions, prefetch them to avoid lots of IOs
    auto rep = table_->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;
    index_block_->NewIterator<IndexBlockIter>(
        icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true,
        index_key_includes_seq_, index_value_is_full_);
    // Index partitions are assumed to be consecuitive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    // Prefetch span runs from the first partition's offset through the last
    // partition's trailer.
    uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    auto& file = table_->rep_->file;
    prefetch_buffer.reset(new FilePrefetchBuffer());
    Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                         static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    Cache* block_cache = rep->table_options.block_cache.get();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value();
      BlockBasedTable::CachableEntry<Block> block;
      Slice compression_dict;
      if (rep->compression_dict_block) {
        compression_dict = rep->compression_dict_block->data;
      }
      const bool is_index = true;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table_->MaybeLoadDataBlockToCache(
          prefetch_buffer.get(), rep, ro, handle, compression_dict, &block,
          is_index, nullptr /* get_context */);

      assert(s.ok() || block.value == nullptr);
      if (s.ok() && block.value != nullptr) {
        if (block.cache_handle != nullptr) {
          if (pin) {
            // Keep the handle; it is released by the Cleanable machinery
            // when this reader is destroyed.
            partition_map_[handle.offset()] = block;
            RegisterCleanup(&ReleaseCachedEntry, block_cache,
                            block.cache_handle);
          } else {
            block_cache->Release(block.cache_handle);
          }
        } else {
          // No cache handle (e.g. block cache disabled): we own the block.
          delete block.value;
        }
      }
    }
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    size_t usage = index_block_->ApproximateMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(BlockBasedTable* table,
                       const InternalKeyComparator* icomparator,
                       std::unique_ptr<Block>&& index_block, Statistics* stats,
                       const int /*level*/, const bool index_key_includes_seq,
                       const bool index_value_is_full)
      : IndexReader(icomparator, stats),
        table_(table),
        index_block_(std::move(index_block)),
        index_key_includes_seq_(index_key_includes_seq),
        index_value_is_full_(index_value_is_full) {
    assert(index_block_ != nullptr);
  }
  BlockBasedTable* table_;
  std::unique_ptr<Block> index_block_;
  // Pinned index partitions keyed by block offset; populated only when
  // CacheDependencies(pin=true) ran.
  std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>>
      partition_map_;
  const bool index_key_includes_seq_;
  const bool index_value_is_full_;
};
373
// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public IndexReader {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(RandomAccessFileReader* file,
                       FilePrefetchBuffer* prefetch_buffer,
                       const Footer& footer, const BlockHandle& index_handle,
                       const ImmutableCFOptions& ioptions,
                       const InternalKeyComparator* icomparator,
                       IndexReader** index_reader,
                       const PersistentCacheOptions& cache_options,
                       const bool index_key_includes_seq,
                       const bool index_value_is_full) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, prefetch_buffer, footer, ReadOptions(), index_handle,
        &index_block, ioptions, true /* decompress */,
        Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (s.ok()) {
      *index_reader = new BinarySearchIndexReader(
          icomparator, std::move(index_block), ioptions.statistics,
          index_key_includes_seq, index_value_is_full);
    }

    return s;
  }

  // Delegates directly to the index block's own binary-search iterator.
  virtual InternalIteratorBase<BlockHandle>* NewIterator(
      IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true,
      bool /*dont_care*/ = true) override {
    Statistics* kNullStats = nullptr;
    return index_block_->NewIterator<IndexBlockIter>(
        icomparator_, icomparator_->user_comparator(), iter, kNullStats, true,
        index_key_includes_seq_, index_value_is_full_);
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    size_t usage = index_block_->ApproximateMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const InternalKeyComparator* icomparator,
                          std::unique_ptr<Block>&& index_block,
                          Statistics* stats, const bool index_key_includes_seq,
                          const bool index_value_is_full)
      : IndexReader(icomparator, stats),
        index_block_(std::move(index_block)),
        index_key_includes_seq_(index_key_includes_seq),
        index_value_is_full_(index_value_is_full) {
    assert(index_block_ != nullptr);
  }
  std::unique_ptr<Block> index_block_;
  const bool index_key_includes_seq_;
  const bool index_value_is_full_;
};
448
// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public IndexReader {
 public:
  // Builds the binary-search index block and then tries to attach a
  // BlockPrefixIndex built from the two hash-index meta blocks. Failure to
  // build the prefix index is non-fatal: the reader falls back to plain
  // binary search (hence several early `return Status::OK()` paths below).
  static Status Create(
      const SliceTransform* hash_key_extractor, const Footer& footer,
      RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
      const ImmutableCFOptions& ioptions,
      const InternalKeyComparator* icomparator, const BlockHandle& index_handle,
      InternalIterator* meta_index_iter, IndexReader** index_reader,
      bool /*hash_index_allow_collision*/,
      const PersistentCacheOptions& cache_options,
      const bool index_key_includes_seq, const bool index_value_is_full) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, prefetch_buffer, footer, ReadOptions(), index_handle,
        &index_block, ioptions, true /* decompress */,
        Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (!s.ok()) {
      return s;
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader = new HashIndexReader(
        icomparator, std::move(index_block), ioptions.statistics,
        index_key_includes_seq, index_value_is_full);
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                      &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    Slice dummy_comp_dict;
    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /* decompress */,
        dummy_comp_dict /*compression dict*/, cache_options);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /* decompress */,
        dummy_comp_dict /*compression dict*/, cache_options);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // NOTE(review): prefixes_contents is local and discarded here; the
    // member prefixes_contents_ is never populated, so BlockPrefixIndex
    // presumably copies what it needs — confirm, and note that
    // ApproximateMemoryUsage() counts an always-empty prefixes_contents_.
    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  // Iterator uses the prefix index (when built) unless total_order_seek
  // forces plain binary search.
  virtual InternalIteratorBase<BlockHandle>* NewIterator(
      IndexBlockIter* iter = nullptr, bool total_order_seek = true,
      bool /*dont_care*/ = true) override {
    Statistics* kNullStats = nullptr;
    return index_block_->NewIterator<IndexBlockIter>(
        icomparator_, icomparator_->user_comparator(), iter, kNullStats,
        total_order_seek, index_key_includes_seq_, index_value_is_full_,
        prefix_index_.get());
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    size_t usage = index_block_->ApproximateMemoryUsage();
    usage += prefixes_contents_.usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const InternalKeyComparator* icomparator,
                  std::unique_ptr<Block>&& index_block, Statistics* stats,
                  const bool index_key_includes_seq,
                  const bool index_value_is_full)
      : IndexReader(icomparator, stats),
        index_block_(std::move(index_block)),
        index_key_includes_seq_(index_key_includes_seq),
        index_value_is_full_(index_value_is_full) {
    assert(index_block_ != nullptr);
  }

  ~HashIndexReader() {
  }

  std::unique_ptr<Block> index_block_;
  std::unique_ptr<BlockPrefixIndex> prefix_index_;
  BlockContents prefixes_contents_;
  const bool index_key_includes_seq_;
  const bool index_value_is_full_;
};
584
// Helper function to setup the cache key's prefix for the Table.
// Each configured cache (block, persistent, compressed) gets its own prefix
// derived from the file's unique id, so that keys from different files (and
// different caches) never collide.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
    // Create dummy offset of index reader which is beyond the file size.
    rep->dummy_index_reader_offset =
        file_size + rep->table_options.block_cache->NewId();
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}
608
// Builds a cache-key prefix for blocks read from `file` into `buffer`
// (which must hold at least kMaxCacheKeyPrefixSize bytes).
// Prefers the file's own unique id; if the file cannot supply one
// (*size == 0) and a cache is provided, falls back to an id allocated
// from the cache. With cc == nullptr and no file id, *size stays 0.
void BlockBasedTable::GenerateCachePrefix(Cache* cc,
    RandomAccessFile* file, char* buffer, size_t* size) {

  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}
622
623void BlockBasedTable::GenerateCachePrefix(Cache* cc,
624 WritableFile* file, char* buffer, size_t* size) {
625
626 // generate an id from the file
627 *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
628
629 // If the prefix wasn't generated or was too long,
630 // create one from the cache.
631 if (*size == 0) {
632 char* end = EncodeVarint64(buffer, cc->NewId());
633 *size = static_cast<size_t>(end - buffer);
634 }
635}
636
637namespace {
638// Return True if table_properties has `user_prop_name` has a `true` value
639// or it doesn't contain this property (for backward compatible).
640bool IsFeatureSupported(const TableProperties& table_properties,
641 const std::string& user_prop_name, Logger* info_log) {
642 auto& props = table_properties.user_collected_properties;
643 auto pos = props.find(user_prop_name);
644 // Older version doesn't have this value set. Skip this check.
645 if (pos != props.end()) {
646 if (pos->second == kPropFalse) {
647 return false;
648 } else if (pos->second != kPropTrue) {
649 ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
650 user_prop_name.c_str(), pos->second.c_str());
651 }
652 }
653 return true;
654}
655
11fdf7f2
TL
656// Caller has to ensure seqno is not nullptr.
657Status GetGlobalSequenceNumber(const TableProperties& table_properties,
658 SequenceNumber largest_seqno,
659 SequenceNumber* seqno) {
660 const auto& props = table_properties.user_collected_properties;
661 const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
662 const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
7c673cae 663
11fdf7f2 664 *seqno = kDisableGlobalSequenceNumber;
7c673cae
FG
665 if (version_pos == props.end()) {
666 if (seqno_pos != props.end()) {
11fdf7f2 667 std::array<char, 200> msg_buf;
7c673cae 668 // This is not an external sst file, global_seqno is not supported.
11fdf7f2
TL
669 snprintf(
670 msg_buf.data(), msg_buf.max_size(),
7c673cae
FG
671 "A non-external sst file have global seqno property with value %s",
672 seqno_pos->second.c_str());
11fdf7f2 673 return Status::Corruption(msg_buf.data());
7c673cae 674 }
11fdf7f2 675 return Status::OK();
7c673cae
FG
676 }
677
678 uint32_t version = DecodeFixed32(version_pos->second.c_str());
679 if (version < 2) {
680 if (seqno_pos != props.end() || version != 1) {
11fdf7f2 681 std::array<char, 200> msg_buf;
7c673cae 682 // This is a v1 external sst file, global_seqno is not supported.
11fdf7f2
TL
683 snprintf(msg_buf.data(), msg_buf.max_size(),
684 "An external sst file with version %u have global seqno "
685 "property with value %s",
686 version, seqno_pos->second.c_str());
687 return Status::Corruption(msg_buf.data());
7c673cae 688 }
11fdf7f2 689 return Status::OK();
7c673cae
FG
690 }
691
11fdf7f2
TL
692 // Since we have a plan to deprecate global_seqno, we do not return failure
693 // if seqno_pos == props.end(). We rely on version_pos to detect whether the
694 // SST is external.
695 SequenceNumber global_seqno(0);
696 if (seqno_pos != props.end()) {
697 global_seqno = DecodeFixed64(seqno_pos->second.c_str());
698 }
699 if (global_seqno != 0 && global_seqno != largest_seqno) {
700 std::array<char, 200> msg_buf;
701 snprintf(msg_buf.data(), msg_buf.max_size(),
702 "An external sst file with version %u have global seqno property "
703 "with value %s, while largest seqno in the file is %llu",
704 version, seqno_pos->second.c_str(),
705 static_cast<unsigned long long>(largest_seqno));
706 return Status::Corruption(msg_buf.data());
707 }
708 global_seqno = largest_seqno;
709 *seqno = largest_seqno;
7c673cae
FG
710
711 if (global_seqno > kMaxSequenceNumber) {
11fdf7f2
TL
712 std::array<char, 200> msg_buf;
713 snprintf(msg_buf.data(), msg_buf.max_size(),
714 "An external sst file with version %u have global seqno property "
715 "with value %llu, which is greater than kMaxSequenceNumber",
716 version, static_cast<unsigned long long>(global_seqno));
717 return Status::Corruption(msg_buf.data());
7c673cae
FG
718 }
719
11fdf7f2 720 return Status::OK();
7c673cae
FG
721}
722} // namespace
723
// Builds a block-cache key by appending the varint-encoded block offset
// (from `handle`) to the table's cache-key prefix. `cache_key` is the
// caller-provided output buffer; the returned Slice points into it.
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}
735
736Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
737 const EnvOptions& env_options,
738 const BlockBasedTableOptions& table_options,
739 const InternalKeyComparator& internal_comparator,
740 unique_ptr<RandomAccessFileReader>&& file,
741 uint64_t file_size,
742 unique_ptr<TableReader>* table_reader,
11fdf7f2 743 const SliceTransform* prefix_extractor,
7c673cae 744 const bool prefetch_index_and_filter_in_cache,
11fdf7f2
TL
745 const bool skip_filters, const int level,
746 const bool immortal_table,
747 const SequenceNumber largest_seqno,
748 TailPrefetchStats* tail_prefetch_stats) {
7c673cae
FG
749 table_reader->reset();
750
751 Footer footer;
752
11fdf7f2
TL
753 std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
754
755 // prefetch both index and filters, down to all partitions
756 const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
757 const bool preload_all = !table_options.cache_index_and_filter_blocks;
758
759 size_t tail_prefetch_size = 0;
760 if (tail_prefetch_stats != nullptr) {
761 // Multiple threads may get a 0 (no history) when running in parallel,
762 // but it will get cleared after the first of them finishes.
763 tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
764 }
765 if (tail_prefetch_size == 0) {
766 // Before read footer, readahead backwards to prefetch data. Do more
767 // readahead if we're going to read index/filter.
768 // TODO: This may incorrectly select small readahead in case partitioned
769 // index/filter is enabled and top-level partition pinning is enabled.
770 // That's because we need to issue readahead before we read the properties,
771 // at which point we don't yet know the index type.
772 tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
773 }
774 size_t prefetch_off;
775 size_t prefetch_len;
776 if (file_size < tail_prefetch_size) {
777 prefetch_off = 0;
778 prefetch_len = static_cast<size_t>(file_size);
779 } else {
780 prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
781 prefetch_len = tail_prefetch_size;
782 }
783 TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
784 &tail_prefetch_size);
785 Status s;
786 // TODO should not have this special logic in the future.
787 if (!file->use_direct_io()) {
788 prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true));
789 s = file->Prefetch(prefetch_off, prefetch_len);
790 } else {
791 prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true));
792 s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len);
793 }
794 s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
795 kBlockBasedTableMagicNumber);
7c673cae
FG
796 if (!s.ok()) {
797 return s;
798 }
799 if (!BlockBasedTableSupportedVersion(footer.version())) {
800 return Status::Corruption(
801 "Unknown Footer version. Maybe this file was created with newer "
802 "version of RocksDB?");
803 }
804
805 // We've successfully read the footer. We are ready to serve requests.
806 // Better not mutate rep_ after the creation. eg. internal_prefix_transform
807 // raw pointer will be used to create HashIndexReader, whose reset may
808 // access a dangling pointer.
809 Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
11fdf7f2
TL
810 internal_comparator, skip_filters,
811 immortal_table);
7c673cae
FG
812 rep->file = std::move(file);
813 rep->footer = footer;
814 rep->index_type = table_options.index_type;
815 rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
816 // We need to wrap data with internal_prefix_transform to make sure it can
817 // handle prefix correctly.
818 rep->internal_prefix_transform.reset(
11fdf7f2 819 new InternalKeySliceTransform(prefix_extractor));
7c673cae
FG
820 SetupCacheKeyPrefix(rep, file_size);
821 unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));
822
823 // page cache options
824 rep->persistent_cache_options =
825 PersistentCacheOptions(rep->table_options.persistent_cache,
826 std::string(rep->persistent_cache_key_prefix,
827 rep->persistent_cache_key_prefix_size),
828 rep->ioptions.statistics);
829
830 // Read meta index
831 std::unique_ptr<Block> meta;
832 std::unique_ptr<InternalIterator> meta_iter;
11fdf7f2 833 s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter);
7c673cae
FG
834 if (!s.ok()) {
835 return s;
836 }
837
838 // Find filter handle and filter type
839 if (rep->filter_policy) {
840 for (auto filter_type :
841 {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
842 Rep::FilterType::kBlockFilter}) {
843 std::string prefix;
844 switch (filter_type) {
845 case Rep::FilterType::kFullFilter:
846 prefix = kFullFilterBlockPrefix;
847 break;
848 case Rep::FilterType::kPartitionedFilter:
849 prefix = kPartitionedFilterBlockPrefix;
850 break;
851 case Rep::FilterType::kBlockFilter:
852 prefix = kFilterBlockPrefix;
853 break;
854 default:
855 assert(0);
856 }
857 std::string filter_block_key = prefix;
858 filter_block_key.append(rep->filter_policy->Name());
859 if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle)
860 .ok()) {
861 rep->filter_type = filter_type;
862 break;
863 }
864 }
865 }
866
867 // Read the properties
868 bool found_properties_block = true;
869 s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
870
871 if (!s.ok()) {
872 ROCKS_LOG_WARN(rep->ioptions.info_log,
873 "Error when seeking to properties block from file: %s",
874 s.ToString().c_str());
875 } else if (found_properties_block) {
876 s = meta_iter->status();
877 TableProperties* table_properties = nullptr;
878 if (s.ok()) {
11fdf7f2
TL
879 s = ReadProperties(meta_iter->value(), rep->file.get(),
880 prefetch_buffer.get(), rep->footer, rep->ioptions,
881 &table_properties, false /* compression_type_missing */);
7c673cae
FG
882 }
883
884 if (!s.ok()) {
885 ROCKS_LOG_WARN(rep->ioptions.info_log,
886 "Encountered error while reading data from properties "
887 "block %s",
888 s.ToString().c_str());
889 } else {
11fdf7f2 890 assert(table_properties != nullptr);
7c673cae 891 rep->table_properties.reset(table_properties);
11fdf7f2
TL
892 rep->blocks_maybe_compressed = rep->table_properties->compression_name !=
893 CompressionTypeToString(kNoCompression);
7c673cae
FG
894 }
895 } else {
896 ROCKS_LOG_ERROR(rep->ioptions.info_log,
897 "Cannot find Properties block from file.");
898 }
11fdf7f2
TL
899#ifndef ROCKSDB_LITE
900 if (rep->table_properties) {
901 ParseSliceTransform(rep->table_properties->prefix_extractor_name,
902 &(rep->table_prefix_extractor));
903 }
904#endif // ROCKSDB_LITE
7c673cae
FG
905
906 // Read the compression dictionary meta block
907 bool found_compression_dict;
11fdf7f2
TL
908 BlockHandle compression_dict_handle;
909 s = SeekToCompressionDictBlock(meta_iter.get(), &found_compression_dict,
910 &compression_dict_handle);
7c673cae
FG
911 if (!s.ok()) {
912 ROCKS_LOG_WARN(
913 rep->ioptions.info_log,
914 "Error when seeking to compression dictionary block from file: %s",
915 s.ToString().c_str());
11fdf7f2 916 } else if (found_compression_dict && !compression_dict_handle.IsNull()) {
7c673cae
FG
917 // TODO(andrewkr): Add to block cache if cache_index_and_filter_blocks is
918 // true.
11fdf7f2
TL
919 std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()};
920 PersistentCacheOptions cache_options;
921 ReadOptions read_options;
922 read_options.verify_checksums = false;
923 BlockFetcher compression_block_fetcher(
924 rep->file.get(), prefetch_buffer.get(), rep->footer, read_options,
925 compression_dict_handle, compression_dict_cont.get(), rep->ioptions, false /* decompress */,
926 Slice() /*compression dict*/, cache_options);
927 s = compression_block_fetcher.ReadBlockContents();
928
7c673cae
FG
929 if (!s.ok()) {
930 ROCKS_LOG_WARN(
931 rep->ioptions.info_log,
932 "Encountered error while reading data from compression dictionary "
933 "block %s",
934 s.ToString().c_str());
935 } else {
11fdf7f2
TL
936 rep->compression_dict_block = std::move(compression_dict_cont);
937 }
938 }
939
940 // Read the table properties, if provided.
941 if (rep->table_properties) {
942 rep->whole_key_filtering &=
943 IsFeatureSupported(*(rep->table_properties),
944 BlockBasedTablePropertyNames::kWholeKeyFiltering,
945 rep->ioptions.info_log);
946 rep->prefix_filtering &= IsFeatureSupported(
947 *(rep->table_properties),
948 BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log);
949
950 s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno,
951 &(rep->global_seqno));
952 if (!s.ok()) {
953 ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str());
954 return s;
7c673cae
FG
955 }
956 }
957
958 // Read the range del meta block
959 bool found_range_del_block;
960 s = SeekToRangeDelBlock(meta_iter.get(), &found_range_del_block,
961 &rep->range_del_handle);
962 if (!s.ok()) {
963 ROCKS_LOG_WARN(
964 rep->ioptions.info_log,
965 "Error when seeking to range delete tombstones block from file: %s",
966 s.ToString().c_str());
967 } else {
968 if (found_range_del_block && !rep->range_del_handle.IsNull()) {
969 ReadOptions read_options;
11fdf7f2
TL
970 s = MaybeLoadDataBlockToCache(
971 prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
972 Slice() /* compression_dict */, &rep->range_del_entry,
973 false /* is_index */, nullptr /* get_context */);
7c673cae
FG
974 if (!s.ok()) {
975 ROCKS_LOG_WARN(
976 rep->ioptions.info_log,
977 "Encountered error while reading data from range del block %s",
978 s.ToString().c_str());
979 }
980 }
981 }
982
11fdf7f2
TL
983 bool need_upper_bound_check =
984 PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor);
985
986 BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();
987 // prefetch the first level of index
988 const bool prefetch_index =
989 prefetch_all ||
990 (table_options.pin_top_level_index_and_filter &&
991 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
992 // prefetch the first level of filter
993 const bool prefetch_filter =
994 prefetch_all || (table_options.pin_top_level_index_and_filter &&
995 rep->filter_type == Rep::FilterType::kPartitionedFilter);
 996 // Partition filters cannot be enabled without partition indexes
997 assert(!prefetch_filter || prefetch_index);
998 // pin both index and filters, down to all partitions
999 const bool pin_all =
1000 rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
1001 // pin the first level of index
1002 const bool pin_index =
1003 pin_all || (table_options.pin_top_level_index_and_filter &&
1004 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
1005 // pin the first level of filter
1006 const bool pin_filter =
1007 pin_all || (table_options.pin_top_level_index_and_filter &&
1008 rep->filter_type == Rep::FilterType::kPartitionedFilter);
1009 // pre-fetching of blocks is turned on
7c673cae
FG
1010 // Will use block cache for index/filter blocks access
1011 // Always prefetch index and filter for level 0
1012 if (table_options.cache_index_and_filter_blocks) {
11fdf7f2
TL
1013 assert(table_options.block_cache != nullptr);
1014 if (prefetch_index) {
7c673cae
FG
1015 // Hack: Call NewIndexIterator() to implicitly add index to the
1016 // block_cache
11fdf7f2
TL
1017 CachableEntry<IndexReader> index_entry;
1018 // check prefix_extractor match only if hash based index is used
1019 bool disable_prefix_seek =
1020 rep->index_type == BlockBasedTableOptions::kHashSearch &&
1021 need_upper_bound_check;
1022 unique_ptr<InternalIteratorBase<BlockHandle>> iter(
1023 new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek,
1024 nullptr, &index_entry));
7c673cae 1025 s = iter->status();
7c673cae 1026 if (s.ok()) {
11fdf7f2
TL
1027 // This is the first call to NewIndexIterator() since we're in Open().
1028 // On success it should give us ownership of the `CachableEntry` by
1029 // populating `index_entry`.
1030 assert(index_entry.value != nullptr);
1031 if (prefetch_all) {
1032 index_entry.value->CacheDependencies(pin_all);
1033 }
1034 if (pin_index) {
1035 rep->index_entry = std::move(index_entry);
7c673cae 1036 } else {
11fdf7f2 1037 index_entry.Release(table_options.block_cache.get());
7c673cae
FG
1038 }
1039 }
1040 }
11fdf7f2
TL
1041 if (s.ok() && prefetch_filter) {
1042 // Hack: Call GetFilter() to implicitly add filter to the block_cache
1043 auto filter_entry =
1044 new_table->GetFilter(rep->table_prefix_extractor.get());
1045 if (filter_entry.value != nullptr && prefetch_all) {
1046 filter_entry.value->CacheDependencies(
1047 pin_all, rep->table_prefix_extractor.get());
1048 }
1049 // if pin_filter is true then save it in rep_->filter_entry; it will be
1050 // released in the destructor only, hence it will be pinned in the
1051 // cache while this reader is alive
1052 if (pin_filter) {
1053 rep->filter_entry = filter_entry;
1054 } else {
1055 filter_entry.Release(table_options.block_cache.get());
1056 }
1057 }
7c673cae
FG
1058 } else {
1059 // If we don't use block cache for index/filter blocks access, we'll
1060 // pre-load these blocks, which will kept in member variables in Rep
1061 // and with a same life-time as this table object.
1062 IndexReader* index_reader = nullptr;
11fdf7f2
TL
1063 s = new_table->CreateIndexReader(prefetch_buffer.get(), &index_reader,
1064 meta_iter.get(), level);
7c673cae
FG
1065 if (s.ok()) {
1066 rep->index_reader.reset(index_reader);
11fdf7f2
TL
1067 // The partitions of partitioned index are always stored in cache. They
1068 // hence follow the configuration for pin and prefetch regardless of
1069 // the value of cache_index_and_filter_blocks
1070 if (prefetch_index_and_filter_in_cache || level == 0) {
1071 rep->index_reader->CacheDependencies(pin_all);
1072 }
7c673cae
FG
1073
1074 // Set filter block
1075 if (rep->filter_policy) {
1076 const bool is_a_filter_partition = true;
11fdf7f2
TL
1077 auto filter = new_table->ReadFilter(
1078 prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition,
1079 rep->table_prefix_extractor.get());
1080 rep->filter.reset(filter);
1081 // Refer to the comment above about partitioned indexes always being
1082 // cached
1083 if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
1084 filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get());
7c673cae
FG
1085 }
1086 }
1087 } else {
1088 delete index_reader;
1089 }
1090 }
1091
1092 if (s.ok()) {
11fdf7f2
TL
1093 assert(prefetch_buffer.get() != nullptr);
1094 if (tail_prefetch_stats != nullptr) {
1095 assert(prefetch_buffer->min_offset_read() < file_size);
1096 tail_prefetch_stats->RecordEffectiveSize(
1097 static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
1098 }
7c673cae
FG
1099 *table_reader = std::move(new_table);
1100 }
1101
1102 return s;
1103}
1104
1105void BlockBasedTable::SetupForCompaction() {
1106 switch (rep_->ioptions.access_hint_on_compaction_start) {
1107 case Options::NONE:
1108 break;
1109 case Options::NORMAL:
1110 rep_->file->file()->Hint(RandomAccessFile::NORMAL);
1111 break;
1112 case Options::SEQUENTIAL:
1113 rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
1114 break;
1115 case Options::WILLNEED:
1116 rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
1117 break;
1118 default:
1119 assert(false);
1120 }
7c673cae
FG
1121}
1122
// Returns the table properties parsed from the properties meta block at
// Open() time. May be nullptr if the properties block was missing or could
// not be read (Open() logs a warning in that case but does not fail).
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}
1127
1128size_t BlockBasedTable::ApproximateMemoryUsage() const {
1129 size_t usage = 0;
1130 if (rep_->filter) {
1131 usage += rep_->filter->ApproximateMemoryUsage();
1132 }
1133 if (rep_->index_reader) {
1134 usage += rep_->index_reader->ApproximateMemoryUsage();
1135 }
1136 return usage;
1137}
1138
1139// Load the meta-block from the file. On success, return the loaded meta block
1140// and its iterator.
1141Status BlockBasedTable::ReadMetaBlock(Rep* rep,
11fdf7f2 1142 FilePrefetchBuffer* prefetch_buffer,
7c673cae
FG
1143 std::unique_ptr<Block>* meta_block,
1144 std::unique_ptr<InternalIterator>* iter) {
1145 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1146 // it is an empty block.
7c673cae
FG
1147 std::unique_ptr<Block> meta;
1148 Status s = ReadBlockFromFile(
11fdf7f2 1149 rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
7c673cae
FG
1150 rep->footer.metaindex_handle(), &meta, rep->ioptions,
1151 true /* decompress */, Slice() /*compression dict*/,
1152 rep->persistent_cache_options, kDisableGlobalSequenceNumber,
1153 0 /* read_amp_bytes_per_bit */);
1154
1155 if (!s.ok()) {
1156 ROCKS_LOG_ERROR(rep->ioptions.info_log,
1157 "Encountered error while reading data from properties"
1158 " block %s",
1159 s.ToString().c_str());
1160 return s;
1161 }
1162
1163 *meta_block = std::move(meta);
1164 // meta block uses bytewise comparator.
11fdf7f2
TL
1165 iter->reset(meta_block->get()->NewIterator<DataBlockIter>(
1166 BytewiseComparator(), BytewiseComparator()));
7c673cae
FG
1167 return Status::OK();
1168}
1169
// Probes the two-level block cache for a data/index block.
//  1. Look up the uncompressed `block_cache`; on a hit, populate
//     `block->value` / `block->cache_handle` (caller owns the pin) and return.
//  2. Otherwise look up `block_cache_compressed`; on a hit, uncompress the
//     contents and, when `read_options.fill_cache` allows, insert the
//     uncompressed copy back into `block_cache`.
// Cache hit/miss/insert counters are recorded either into `get_context`
// (per-read stats, aggregated later) or directly into `statistics`.
// Returns non-OK only if decompression or the cache insert fails; a plain
// double miss still returns OK with `block->value == nullptr`.
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ImmutableCFOptions& ioptions, const ReadOptions& read_options,
    BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
    const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index,
    GetContext* get_context) {
  Status s;
  Block* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;
  Statistics* statistics = ioptions.statistics;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    // Tick the index- or data-specific counters depending on what kind of
    // block this key refers to.
    block->cache_handle = GetEntryFromCache(
        block_cache, block_cache_key,
        is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS,
        is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT,
        get_context
            ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss
                        : &get_context->get_context_stats_.num_cache_data_miss)
            : nullptr,
        get_context
            ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit
                        : &get_context->get_context_stats_.num_cache_data_hit)
            : nullptr,
        statistics, get_context);
    if (block->cache_handle != nullptr) {
      block->value =
          reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->cache_handle == nullptr && block->value == nullptr);

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);
  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<Block*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  // Only blocks that are actually compressed are stored in this cache.
  assert(compressed_block->compression_type() != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext uncompresssion_ctx(compressed_block->compression_type(),
                                          compression_dict);
  s = UncompressBlockContents(uncompresssion_ctx, compressed_block->data(),
                              compressed_block->size(), &contents,
                              format_version, ioptions);

  // Insert uncompressed block into block cache
  if (s.ok()) {
    // Preserve the global sequence number of the cached compressed block.
    block->value =
        new Block(std::move(contents), compressed_block->global_seqno(),
                  read_amp_bytes_per_bit,
                  statistics);  // uncompressed block
    assert(block->value->compression_type() == kNoCompression);
    if (block_cache != nullptr && block->value->cachable() &&
        read_options.fill_cache) {
      size_t charge = block->value->ApproximateMemoryUsage();
      s = block_cache->Insert(block_cache_key, block->value, charge,
                              &DeleteCachedEntry<Block>,
                              &(block->cache_handle));
      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
      if (s.ok()) {
        // Route add/bytes-written counters to per-read stats when available,
        // otherwise to the global statistics object.
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_add++;
          get_context->get_context_stats_.num_cache_bytes_write += charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
        }
        if (is_index) {
          if (get_context != nullptr) {
            get_context->get_context_stats_.num_cache_index_add++;
            get_context->get_context_stats_.num_cache_index_bytes_insert +=
                charge;
          } else {
            RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
            RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
          }
        } else {
          if (get_context != nullptr) {
            get_context->get_context_stats_.num_cache_data_add++;
            get_context->get_context_stats_.num_cache_data_bytes_insert +=
                charge;
          } else {
            RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
            RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
          }
        }
      } else {
        // Insert failed: reclaim the freshly uncompressed block so the caller
        // does not see a dangling value.
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
        delete block->value;
        block->value = nullptr;
      }
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}
1288
// Inserts a freshly read `raw_block` into the block cache(s).
// Ownership of `raw_block` is taken by this function: it either ends up in a
// cache or is deleted here before returning.
//  - If `raw_block` is compressed, it is uncompressed first and the
//    compressed original may additionally be inserted into
//    `block_cache_compressed`.
//  - On success, `block->value` points at the uncompressed block; when
//    `block_cache` is non-null it is also pinned there via
//    `block->cache_handle` (if `block_cache` is null, the caller simply owns
//    the heap block with a null handle).
// Stats go to `get_context` (per-read) when provided, else to `statistics`.
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions,
    CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
    const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index,
    Cache::Priority priority, GetContext* get_context) {
  // A compressed raw block only makes sense if there is a compressed cache to
  // put it in.
  assert(raw_block->compression_type() == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  Statistics* statistics = ioptions.statistics;
  if (raw_block->compression_type() != kNoCompression) {
    UncompressionContext uncompression_ctx(raw_block->compression_type(),
                                           compression_dict);
    s = UncompressBlockContents(uncompression_ctx, raw_block->data(),
                                raw_block->size(), &contents, format_version,
                                ioptions);
  }
  if (!s.ok()) {
    // Decompression failed; we still own raw_block, so free it.
    delete raw_block;
    return s;
  }

  if (raw_block->compression_type() != kNoCompression) {
    // Wrap the uncompressed contents; keep the original's global seqno.
    block->value = new Block(std::move(contents), raw_block->global_seqno(),
                             read_amp_bytes_per_bit,
                             statistics);  // uncompressed block
  } else {
    // Already uncompressed: hand the raw block itself to the caller and drop
    // our owning pointer so the cleanup below doesn't delete it.
    block->value = raw_block;
    raw_block = nullptr;
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr && raw_block != nullptr &&
      raw_block->cachable()) {
    s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block,
                                       raw_block->ApproximateMemoryUsage(),
                                       &DeleteCachedEntry<Block>);
    if (s.ok()) {
      // Avoid the following code to delete this cached block.
      raw_block = nullptr;
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
    }
  }
  // Frees raw_block only if it was neither handed to the caller nor cached
  // (delete on nullptr is a no-op).
  delete raw_block;

  // insert into uncompressed block cache
  assert((block->value->compression_type() == kNoCompression));
  if (block_cache != nullptr && block->value->cachable()) {
    size_t charge = block->value->ApproximateMemoryUsage();
    s = block_cache->Insert(block_cache_key, block->value, charge,
                            &DeleteCachedEntry<Block>, &(block->cache_handle),
                            priority);
    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
    if (s.ok()) {
      assert(block->cache_handle != nullptr);
      // Per-read stats when available, global ticks otherwise.
      if (get_context != nullptr) {
        get_context->get_context_stats_.num_cache_add++;
        get_context->get_context_stats_.num_cache_bytes_write += charge;
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD);
        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
      }
      if (is_index) {
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_index_add++;
          get_context->get_context_stats_.num_cache_index_bytes_insert +=
              charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
        }
      } else {
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_data_add++;
          get_context->get_context_stats_.num_cache_data_bytes_insert += charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
        }
      }
      assert(reinterpret_cast<Block*>(
                 block_cache->Value(block->cache_handle)) == block->value);
    } else {
      // Insert failed; free the uncompressed block so the caller doesn't see
      // a dangling pointer.
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      delete block->value;
      block->value = nullptr;
    }
  }

  return s;
}
1387
// Reads the filter block at `filter_handle` from the file and wraps it in the
// FilterBlockReader subclass matching rep_->filter_type. Returns nullptr when
// the table has no filter or when the block read fails; otherwise the caller
// owns the returned reader. `is_a_filter_partition` distinguishes reading an
// individual partition (decoded as a full filter) from the top-level filter
// block of a partitioned filter.
FilterBlockReader* BlockBasedTable::ReadFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
    const bool is_a_filter_partition,
    const SliceTransform* prefix_extractor) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;

  // Filter blocks are stored uncompressed, so no compression dictionary is
  // needed and decompress is disabled below.
  Slice dummy_comp_dict;

  BlockFetcher block_fetcher(rep->file.get(), prefetch_buffer, rep->footer,
                             ReadOptions(), filter_handle, &block,
                             rep->ioptions, false /* decompress */,
                             dummy_comp_dict, rep->persistent_cache_options);
  Status s = block_fetcher.ReadBlockContents();

  if (!s.ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  // A partition of a partitioned filter is itself encoded as a full filter.
  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      // Top-level index over filter partitions; the last two args tell the
      // reader how index entries are encoded (user key vs internal key,
      // delta-encoded values) based on table properties.
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_key_is_user_key == 0,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_value_is_delta_encoded == 0);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}
1456
1457BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
11fdf7f2
TL
1458 const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer,
1459 bool no_io, GetContext* get_context) const {
7c673cae
FG
1460 const BlockHandle& filter_blk_handle = rep_->filter_handle;
1461 const bool is_a_filter_partition = true;
11fdf7f2
TL
1462 return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition,
1463 no_io, get_context, prefix_extractor);
7c673cae
FG
1464}
1465
// Returns the filter reader for `filter_blk_handle`, consulting the block
// cache first and reading from the file (via ReadFilter) on a miss unless
// `no_io` forbids it. The returned CachableEntry pins the cache handle when
// the filter came from/went into the cache; the caller must release it (or
// keep it pinned, as Open() does for rep_->filter_entry).
BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
    const bool is_a_filter_partition, bool no_io, GetContext* get_context,
    const SliceTransform* prefix_extractor) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), nullptr /* cache handle */};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return {nullptr /* filter */, nullptr /* cache handle */};
  }

  // Fast path: the top-level filter was pinned at Open() time.
  if (!is_a_filter_partition && rep_->filter_entry.IsSet()) {
    return rep_->filter_entry;
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Statistics* statistics = rep_->ioptions.statistics;
  auto cache_handle = GetEntryFromCache(
      block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT,
      get_context ? &get_context->get_context_stats_.num_cache_filter_miss
                  : nullptr,
      get_context ? &get_context->get_context_stats_.num_cache_filter_hit
                  : nullptr,
      statistics, get_context);

  FilterBlockReader* filter = nullptr;
  if (cache_handle != nullptr) {
    filter = reinterpret_cast<FilterBlockReader*>(
        block_cache->Value(cache_handle));
  } else if (no_io) {
    // Do not invoke any io.
    return CachableEntry<FilterBlockReader>();
  } else {
    // Cache miss: read the filter block from the file and insert it.
    filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                        is_a_filter_partition, prefix_extractor);
    if (filter != nullptr) {
      size_t usage = filter->ApproximateMemoryUsage();
      Status s = block_cache->Insert(
          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        // Per-read stats when available, global ticks otherwise.
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_add++;
          get_context->get_context_stats_.num_cache_bytes_write += usage;
          get_context->get_context_stats_.num_cache_filter_add++;
          get_context->get_context_stats_.num_cache_filter_bytes_insert +=
              usage;
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
          RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
          RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
        }
      } else {
        // Insert failed: free the just-read filter and report an empty entry.
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
        delete filter;
        return CachableEntry<FilterBlockReader>();
      }
    }
  }

  return { filter, cache_handle };
}
1545
11fdf7f2
TL
// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
//
// Returns an iterator over the index block, creating/caching the IndexReader
// on demand:
//  - If the index reader was preloaded (no block cache for index) or pinned
//    at Open() time, iterate over it directly.
//  - Otherwise look it up in the block cache; on a miss (and if IO is
//    allowed) build it with CreateIndexReader and insert it.
// If `index_entry` is non-null the caller takes ownership of the pinned
// cache entry; otherwise the handle is released via the iterator's cleanup.
// On failure, returns `input_iter` invalidated with the error (if provided)
// or a fresh error iterator.
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, bool disable_prefix_seek,
    IndexBlockIter* input_iter, CachableEntry<IndexReader>* index_entry,
    GetContext* get_context) {
  // index reader has already been pre-populated.
  if (rep_->index_reader) {
    return rep_->index_reader->NewIterator(
        input_iter, read_options.total_order_seek || disable_prefix_seek,
        read_options.fill_cache);
  }
  // we have a pinned index block
  if (rep_->index_entry.IsSet()) {
    return rep_->index_entry.value->NewIterator(
        input_iter, read_options.total_order_seek || disable_prefix_seek,
        read_options.fill_cache);
  }

  PERF_TIMER_GUARD(read_index_block_nanos);

  const bool no_io = read_options.read_tier == kBlockCacheTier;
  Cache* block_cache = rep_->table_options.block_cache.get();
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  // The index reader is keyed by a synthetic offset rather than a real block
  // handle.
  auto key =
      GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                            rep_->dummy_index_reader_offset, cache_key);
  Statistics* statistics = rep_->ioptions.statistics;
  auto cache_handle = GetEntryFromCache(
      block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT,
      get_context ? &get_context->get_context_stats_.num_cache_index_miss
                  : nullptr,
      get_context ? &get_context->get_context_stats_.num_cache_index_hit
                  : nullptr,
      statistics, get_context);

  if (cache_handle == nullptr && no_io) {
    // Cache miss but the caller forbids IO: report Incomplete.
    if (input_iter != nullptr) {
      input_iter->Invalidate(Status::Incomplete("no blocking io"));
      return input_iter;
    } else {
      return NewErrorInternalIterator<BlockHandle>(
          Status::Incomplete("no blocking io"));
    }
  }

  IndexReader* index_reader = nullptr;
  if (cache_handle != nullptr) {
    index_reader =
        reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
  } else {
    // Create index reader and put it in the cache.
    Status s;
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2");
    s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader);
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1");
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3");
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4");
    size_t charge = 0;
    if (s.ok()) {
      assert(index_reader != nullptr);
      charge = index_reader->ApproximateMemoryUsage();
      s = block_cache->Insert(
          key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
    }

    if (s.ok()) {
      // Per-read stats when available, global ticks otherwise.
      if (get_context != nullptr) {
        get_context->get_context_stats_.num_cache_add++;
        get_context->get_context_stats_.num_cache_bytes_write += charge;
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD);
        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
      }
      RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
      RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
    } else {
      if (index_reader != nullptr) {
        delete index_reader;
      }
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      // make sure if something goes wrong, index_reader shall remain intact.
      if (input_iter != nullptr) {
        input_iter->Invalidate(s);
        return input_iter;
      } else {
        return NewErrorInternalIterator<BlockHandle>(s);
      }
    }

  }

  assert(cache_handle);
  auto* iter = index_reader->NewIterator(
      input_iter, read_options.total_order_seek || disable_prefix_seek);

  // the caller would like to take ownership of the index block
  // don't call RegisterCleanup() in this case, the caller will take care of it
  if (index_entry != nullptr) {
    *index_entry = {index_reader, cache_handle};
  } else {
    iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
  }

  return iter;
}
1655
7c673cae
FG
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, a new iterator is heap-allocated and returned;
// if input_iter is not null, it is updated in place and returned.
//
// The block is looked up in (and possibly inserted into) the block cache via
// MaybeLoadDataBlockToCache; on a cache miss with IO allowed it is read
// directly from the file. Errors (including the incoming `s`) are reported by
// invalidating the returned iterator rather than by a status return value.
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
    Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
    TBlockIter* input_iter, bool is_index, bool key_includes_seq,
    bool index_key_is_full, GetContext* get_context, Status s,
    FilePrefetchBuffer* prefetch_buffer) {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep->table_options.block_cache.get();
  CachableEntry<Block> block;
  Slice compression_dict;
  if (s.ok()) {
    if (rep->compression_dict_block) {
      compression_dict = rep->compression_dict_block->data;
    }
    s = MaybeLoadDataBlockToCache(prefetch_buffer, rep, ro, handle,
                                  compression_dict, &block, is_index,
                                  get_context);
  }

  TBlockIter* iter;
  if (input_iter != nullptr) {
    iter = input_iter;
  } else {
    iter = new TBlockIter;
  }
  // Didn't get any data from block caches.
  if (s.ok() && block.value == nullptr) {
    if (no_io) {
      // Could not read from block_cache and can't do IO
      iter->Invalidate(Status::Incomplete("no blocking io"));
      return iter;
    }
    std::unique_ptr<Block> block_value;
    {
      StopWatch sw(rep->ioptions.env, rep->ioptions.statistics,
                   READ_BLOCK_GET_MICROS);
      // Index blocks never carry a per-file global sequence number.
      s = ReadBlockFromFile(
          rep->file.get(), prefetch_buffer, rep->footer, ro, handle,
          &block_value, rep->ioptions, rep->blocks_maybe_compressed,
          compression_dict, rep->persistent_cache_options,
          is_index ? kDisableGlobalSequenceNumber : rep->global_seqno,
          rep->table_options.read_amp_bytes_per_bit, rep->immortal_table);
    }
    if (s.ok()) {
      block.value = block_value.release();
    }
  }

  if (s.ok()) {
    assert(block.value != nullptr);
    const bool kTotalOrderSeek = true;
    iter = block.value->NewIterator<TBlockIter>(
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
        iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
        index_key_is_full);
    if (block.cache_handle != nullptr) {
      // Block came from (or was inserted into) the cache: release the handle
      // when the iterator is destroyed.
      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
                            block.cache_handle);
    } else {
      if (!ro.fill_cache && rep->cache_key_prefix_size != 0) {
        // insert a dummy record to block cache to track the memory usage
        Cache::Handle* cache_handle;
        // There are two other types of cache keys: 1) SST cache key added in
        // `MaybeLoadDataBlockToCache` 2) dummy cache key added in
        // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
        // from SST cache key(31 bytes), and use non-zero prefix to
        // differentiate from `write_buffer_manager`
        const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
        char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
        // Prefix: use rep->cache_key_prefix padded by 0s
        memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
        assert(rep->cache_key_prefix_size != 0);
        assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix);
        memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size);
        char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                   next_cache_key_id_++);
        assert(end - cache_key <=
               static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
        Slice unique_key =
            Slice(cache_key, static_cast<size_t>(end - cache_key));
        // Value is nullptr: the entry only charges the block's memory usage.
        s = block_cache->Insert(unique_key, nullptr,
                                block.value->ApproximateMemoryUsage(), nullptr,
                                &cache_handle);
        if (s.ok()) {
          if (cache_handle != nullptr) {
            iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
                                  cache_handle);
          }
        }
      }
      // The iterator owns the heap-allocated block in the uncached case.
      iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
    }
  } else {
    assert(block.value == nullptr);
    iter->Invalidate(s);
  }
  return iter;
}
1760
// Try to populate *block_entry with the uncompressed block at `handle`.
// Lookup order: uncompressed block cache, compressed block cache, then (when
// IO is allowed and ro.fill_cache is set) a file read whose result is
// inserted back into the cache(s). If no block cache is configured this is a
// no-op. On return, s.ok() with block_entry->value == nullptr means the block
// was not cached and no IO was performed (kBlockCacheTier / !fill_cache).
Status BlockBasedTable::MaybeLoadDataBlockToCache(
    FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
    const BlockHandle& handle, Slice compression_dict,
    CachableEntry<Block>* block_entry, bool is_index, GetContext* get_context) {
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep->table_options.block_cache.get();
  Cache* block_cache_compressed =
      rep->table_options.block_cache_compressed.get();

  // If either block cache is enabled, we'll try to read from it.
  Status s;
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    Statistics* statistics = rep->ioptions.statistics;
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
    char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
    Slice key, /* key to the block cache */
        ckey /* key to the compressed block cache */;

    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size,
                        handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(rep->compressed_cache_key_prefix,
                         rep->compressed_cache_key_prefix_size, handle,
                         compressed_cache_key);
    }

    s = GetDataBlockFromCache(
        key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
        block_entry, rep->table_options.format_version, compression_dict,
        rep->table_options.read_amp_bytes_per_bit, is_index, get_context);

    // Cache miss: read from the file and warm the cache, unless the caller
    // forbids IO or asked not to fill the cache.
    if (block_entry->value == nullptr && !no_io && ro.fill_cache) {
      std::unique_ptr<Block> raw_block;
      {
        StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        // Keep the block compressed when a compressed cache exists so it can
        // be stored there in its on-disk form.
        s = ReadBlockFromFile(
            rep->file.get(), prefetch_buffer, rep->footer, ro, handle,
            &raw_block, rep->ioptions,
            block_cache_compressed == nullptr && rep->blocks_maybe_compressed,
            compression_dict, rep->persistent_cache_options,
            is_index ? kDisableGlobalSequenceNumber : rep->global_seqno,
            rep->table_options.read_amp_bytes_per_bit, rep->immortal_table);
      }

      if (s.ok()) {
        s = PutDataBlockToCache(
            key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
            block_entry, raw_block.release(), rep->table_options.format_version,
            compression_dict, rep->table_options.read_amp_bytes_per_bit,
            is_index,
            is_index && rep->table_options
                            .cache_index_and_filter_blocks_with_high_priority
                ? Cache::Priority::HIGH
                : Cache::Priority::LOW,
            get_context);
      }
    }
  }
  assert(s.ok() || block_entry->value == nullptr);
  return s;
}
1827
11fdf7f2
TL
// State shared by the secondary iterators of a partitioned index: the owning
// table, the map of pre-loaded index-partition blocks keyed by block offset,
// and the key-format flags describing how index keys are encoded.
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
    BlockBasedTable* table,
    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
    bool index_key_includes_seq, bool index_key_is_full)
    : table_(table),
      block_map_(block_map),
      index_key_includes_seq_(index_key_includes_seq),
      index_key_is_full_(index_key_is_full) {}
1836
// Cap on the automatic readahead window used by BlockBasedTableIterator
// (see InitDataBlock): readahead_size_ doubles after each sequential read
// until it reaches this limit (256 KB).
template <class TBlockIter, typename TValue>
const size_t BlockBasedTableIterator<TBlockIter, TValue>::kMaxReadaheadSize =
    256 * 1024;
1840
// Return a block iterator over the index partition identified by `handle`.
// Partitions are expected to have been pre-loaded into block_map_; a missing
// partition yields an empty iterator rather than an error.
InternalIteratorBase<BlockHandle>*
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
    const BlockHandle& handle) {
  // Return a block iterator on the index partition
  auto rep = table_->get_rep();
  auto block = block_map_->find(handle.offset());
  if (block != block_map_->end()) {
    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT);
    Cache* block_cache = rep->table_options.block_cache.get();
    assert(block_cache);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ,
               block_cache->GetUsage(block->second.cache_handle));
    Statistics* kNullStats = nullptr;
    return block->second.value->NewIterator<IndexBlockIter>(
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
        nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
  }
  // Partition not present in block_map_. This is a possible scenario since
  // the block cache might not have had space for the partition.
  // Create an empty iterator.
  return new IndexBlockIter();
}
1865
// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
//
// Returns false only when the prefix bloom filter proves no key with the
// target's prefix can exist in this table; any uncertainty (no filter, filter
// not in memory, prefix out of the extractor's domain, ...) returns true.
bool BlockBasedTable::PrefixMayMatch(
    const Slice& internal_key, const ReadOptions& read_options,
    const SliceTransform* options_prefix_extractor,
    const bool need_upper_bound_check) {
  if (!rep_->filter_policy) {
    return true;
  }

  const SliceTransform* prefix_extractor;

  // Prefer the prefix extractor recorded in the table; fall back to the one
  // from the options only when the table did not record one.
  if (rep_->table_prefix_extractor == nullptr) {
    if (need_upper_bound_check) {
      return true;
    }
    prefix_extractor = options_prefix_extractor;
  } else {
    prefix_extractor = rep_->table_prefix_extractor.get();
  }
  auto user_key = ExtractUserKey(internal_key);
  if (!prefix_extractor->InDomain(user_key)) {
    return true;
  }

  bool may_match = true;
  Status s;

  // First, try check with full filter
  auto filter_entry = GetFilter(prefix_extractor);
  FilterBlockReader* filter = filter_entry.value;
  bool filter_checked = true;
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
      // Full/partitioned filter: one probe answers for the whole file.
      const Slice* const const_ikey_ptr = &internal_key;
      may_match = filter->RangeMayExist(
          read_options.iterate_upper_bound, user_key, prefix_extractor,
          rep_->internal_comparator.user_comparator(), const_ikey_ptr,
          &filter_checked, need_upper_bound_check);
    } else {
      // if prefix_extractor changed for block based filter, skip filter
      if (need_upper_bound_check) {
        if (!rep_->filter_entry.IsSet()) {
          filter_entry.Release(rep_->table_options.block_cache.get());
        }
        return true;
      }
      auto prefix = prefix_extractor->Transform(user_key);
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

      // Then, try find it within each block
      // we already know prefix_extractor and prefix_extractor_name must match
      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
      unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
          NewIndexIterator(no_io_read_options,
                           /* need_upper_bound_check */ false));
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
      } else if ((rep_->table_properties &&
                          rep_->table_properties->index_key_is_user_key
                      ? iiter->key()
                      : ExtractUserKey(iiter->key()))
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block. However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix. Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix. Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key. Thus, the corresponding data block
        // is the only one that could potentially contain the prefix.
        BlockHandle handle = iiter->value();
        may_match =
            filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset());
      }
    }
  }

  if (filter_checked) {
    Statistics* statistics = rep_->ioptions.statistics;
    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
    if (!may_match) {
      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
    }
  }

  // if rep_->filter_entry is not set, we should call Release(); otherwise
  // don't call, in this case we have a local copy in rep_->filter_entry,
  // it's pinned to the cache and will be released in the destructor
  if (!rep_->filter_entry.IsSet()) {
    filter_entry.Release(rep_->table_options.block_cache.get());
  }
  return may_match;
}
1992
11fdf7f2
TL
// Position the iterator at the first entry >= target. If the prefix bloom
// filter proves the target's prefix is absent, the iterator is invalidated
// without any block read.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
  is_out_of_bound_ = false;
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  // Release the previously pinned data block (if any) before re-seeking.
  SavePrevIndexValue();

  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }

  InitDataBlock();

  block_iter_.Seek(target);

  // Skip to the next non-empty block if the seek landed past this block's
  // last entry; also applies the iterate_upper_bound check.
  FindKeyForward();
  assert(
      !block_iter_.Valid() ||
      (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) ||
      (!key_includes_seq_ &&
       icomp_.user_comparator()->Compare(ExtractUserKey(target),
                                         block_iter_.key()) <= 0));
}
2022
// Position the iterator at the last entry <= target.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
  is_out_of_bound_ = false;
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely contain the position for `target`, the
  // same as Seek(), rather than before.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4] [6, 8] [10, 12]
  // (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they call Seek(7).
  // The only case where the block is different is when they seek to a position
  // in the boundary. For example, if they SeekForPrev(5), we should go to the
  // first block, rather than the second. However, we don't have the information
  // to distinguish the two unless we read the second block. In this case, we'll
  // end up with reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    // target is past the last index entry: start from the last block.
    index_iter_->SeekToLast();
    if (!index_iter_->Valid()) {
      ResetDataIter();
      block_iter_points_to_real_block_ = false;
      return;
    }
  }

  InitDataBlock();

  block_iter_.SeekForPrev(target);

  // Step back into the previous non-empty block if necessary.
  FindKeyBackward();
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
}
2066
2067template <class TBlockIter, typename TValue>
2068void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2069 is_out_of_bound_ = false;
2070 SavePrevIndexValue();
2071 index_iter_->SeekToFirst();
2072 if (!index_iter_->Valid()) {
2073 ResetDataIter();
2074 return;
2075 }
2076 InitDataBlock();
2077 block_iter_.SeekToFirst();
2078 FindKeyForward();
2079}
2080
2081template <class TBlockIter, typename TValue>
2082void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2083 is_out_of_bound_ = false;
2084 SavePrevIndexValue();
2085 index_iter_->SeekToLast();
2086 if (!index_iter_->Valid()) {
2087 ResetDataIter();
2088 return;
2089 }
2090 InitDataBlock();
2091 block_iter_.SeekToLast();
2092 FindKeyBackward();
2093}
2094
// Advance to the next entry, crossing into the next data block when the
// current one is exhausted (handled by FindKeyForward).
// REQUIRES: the iterator currently points into a real data block.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Next();
  FindKeyForward();
}
2101
// Step back to the previous entry, crossing into the previous data block when
// the current one is exhausted (handled by FindKeyBackward).
// REQUIRES: the iterator currently points into a real data block.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Prev();
  FindKeyBackward();
}
2108
// Materialize block_iter_ over the data block the index iterator currently
// points to, unless it already covers that block and its last read did not
// miss the cache. Also drives automatic readahead for sequential scans.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value();
  if (!block_iter_points_to_real_block_ ||
      data_block_handle.offset() != prev_index_value_.offset() ||
      // if previous attempt of reading the block missed cache, try again
      block_iter_.status().IsIncomplete()) {
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

    // Automatically prefetch additional data when a range scan (iterator) does
    // more than 2 sequential IOs. This is enabled only for user reads and when
    // ReadOptions.readahead_size is 0.
    if (!for_compaction_ && read_options_.readahead_size == 0) {
      num_file_reads_++;
      if (num_file_reads_ > 2) {
        // Only issue buffered readahead when the next block reaches past the
        // window we already prefetched.
        if (!rep->file->use_direct_io() &&
            (data_block_handle.offset() +
                 static_cast<size_t>(data_block_handle.size()) +
                 kBlockTrailerSize >
             readahead_limit_)) {
          // Buffered I/O
          // Discarding the return status of Prefetch calls intentionally, as we
          // can fallback to reading from disk if Prefetch fails.
          rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
          readahead_limit_ =
              static_cast<size_t>(data_block_handle.offset() + readahead_size_);
          // Keep exponentially increasing readahead size until
          // kMaxReadaheadSize.
          readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2);
        } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
          // Direct I/O
          // Let FilePrefetchBuffer take care of the readahead.
          prefetch_buffer_.reset(new FilePrefetchBuffer(
              rep->file.get(), kInitReadaheadSize, kMaxReadaheadSize));
        }
      }
    }

    // Read errors surface through block_iter_'s status, not a return value.
    Status s;
    BlockBasedTable::NewDataBlockIterator<TBlockIter>(
        rep, read_options_, data_block_handle, &block_iter_, is_index_,
        key_includes_seq_, index_key_is_full_,
        /* get_context */ nullptr, s, prefetch_buffer_.get());
    block_iter_points_to_real_block_ = true;
  }
}
2158
// After a forward move, skip over exhausted/empty data blocks until a valid
// entry is found (or the index is exhausted), then apply the
// iterate_upper_bound check. Sets is_out_of_bound_ when the bound is reached.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  assert(!is_out_of_bound_);
  // TODO the while loop inherits from two-level-iterator. We don't know
  // whether a block can be empty so it can be replaced by an "if".
  while (!block_iter_.Valid()) {
    if (!block_iter_.status().ok()) {
      return;
    }
    ResetDataIter();
    // We used to check the current index key for upperbound.
    // It will only save a data reading for a small percentage of use cases,
    // so for code simplicity, we removed it. We can add it back if there is a
    // significant performance regression.
    index_iter_->Next();

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToFirst();
    } else {
      return;
    }
  }

  // Check upper bound on the current key
  bool reached_upper_bound =
      (read_options_.iterate_upper_bound != nullptr &&
       block_iter_points_to_real_block_ && block_iter_.Valid() &&
       icomp_.user_comparator()->Compare(ExtractUserKey(block_iter_.key()),
                                         *read_options_.iterate_upper_bound) >=
           0);
  TEST_SYNC_POINT_CALLBACK(
      "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound",
      &reached_upper_bound);
  if (reached_upper_bound) {
    is_out_of_bound_ = true;
    return;
  }
}
2198
2199template <class TBlockIter, typename TValue>
2200void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
2201 assert(!is_out_of_bound_);
2202 while (!block_iter_.Valid()) {
2203 if (!block_iter_.status().ok()) {
2204 return;
2205 }
2206
2207 ResetDataIter();
2208 index_iter_->Prev();
2209
2210 if (index_iter_->Valid()) {
2211 InitDataBlock();
2212 block_iter_.SeekToLast();
2213 } else {
2214 return;
2215 }
2216 }
2217
2218 // We could have check lower bound here too, but we opt not to do it for
2219 // code simplicity.
2220}
2221
// Create an iterator over the table's entries. When `arena` is non-null the
// iterator is placement-allocated in it (the caller must not delete it
// directly). Filters are consulted during Seek() unless `skip_filters` or
// total-order seek is requested.
InternalIterator* BlockBasedTable::NewIterator(
    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
    Arena* arena, bool skip_filters, bool for_compaction) {
  // The prefix extractor in effect may differ from the one this file was
  // written with; if so, seeks need an extra upper-bound check.
  bool need_upper_bound_check =
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  const bool kIsNotIndex = false;
  if (arena == nullptr) {
    // NOTE(review): this branch additionally requires index_type ==
    // kHashSearch before passing need_upper_bound_check to NewIndexIterator,
    // while the arena branch below passes it unconditionally -- confirm the
    // asymmetry is intentional.
    return new BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(
            read_options,
            need_upper_bound_check &&
                rep_->index_type == BlockBasedTableOptions::kHashSearch),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, kIsNotIndex,
        true /*key_includes_seq*/, for_compaction);
  } else {
    auto* mem =
        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
    return new (mem) BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(read_options, need_upper_bound_check),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, kIsNotIndex,
        true /*key_includes_seq*/, for_compaction);
  }
}
2251
// Return an iterator over this table's range-deletion meta-block, or nullptr
// when the table has no range tombstones. Prefers the pinned cached copy
// (ref-counted so the iterator may outlive this reader); otherwise falls back
// to the normal block-read path.
InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
    const ReadOptions& read_options) {
  if (rep_->range_del_handle.IsNull()) {
    // The block didn't exist, nullptr indicates no range tombstones.
    return nullptr;
  }
  if (rep_->range_del_entry.cache_handle != nullptr) {
    // We have a handle to an uncompressed block cache entry that's held for
    // this table's lifetime. Increment its refcount before returning an
    // iterator based on it since the returned iterator may outlive this table
    // reader.
    assert(rep_->range_del_entry.value != nullptr);
    Cache* block_cache = rep_->table_options.block_cache.get();
    assert(block_cache != nullptr);
    if (block_cache->Ref(rep_->range_del_entry.cache_handle)) {
      auto iter = rep_->range_del_entry.value->NewIterator<DataBlockIter>(
          &rep_->internal_comparator,
          rep_->internal_comparator.user_comparator());
      // Drop the extra reference when the iterator is destroyed.
      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
                            rep_->range_del_entry.cache_handle);
      return iter;
    }
  }
  // The meta-block exists but isn't in uncompressed block cache (maybe
  // because it is disabled), so go through the full lookup process.
  return NewDataBlockIterator<DataBlockIter>(rep_, read_options,
                                             rep_->range_del_handle);
}
2280
11fdf7f2
TL
// Probe the full (non-block-based) filter for `internal_key`. Returns false
// only when the filter proves the key (or, for prefix filters, its prefix)
// cannot be in this table; block-based filters are handled per-block by the
// caller, so they always return true here.
bool BlockBasedTable::FullFilterKeyMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    const Slice& internal_key, const bool no_io,
    const SliceTransform* prefix_extractor) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return true;
  }
  Slice user_key = ExtractUserKey(internal_key);
  const Slice* const const_ikey_ptr = &internal_key;
  bool may_match = true;
  if (filter->whole_key_filtering()) {
    may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid,
                                    no_io, const_ikey_ptr);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             // Only trust the prefix filter when it was built with the same
             // extractor that is in effect now.
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0 &&
             prefix_extractor->InDomain(user_key) &&
             !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
                                     prefix_extractor, kNotValid, false,
                                     const_ikey_ptr)) {
    may_match = false;
  }
  if (may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
  }
  return may_match;
}
2308
// Point lookup: consult the filter first, then walk candidate data blocks via
// the index, feeding each matching entry to get_context->SaveValue() until it
// signals completion. `key` must be an internal key (user key + 8-byte tail).
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                            GetContext* get_context,
                            const SliceTransform* prefix_extractor,
                            bool skip_filters) {
  assert(key.size() >= 8);  // key must be internal key
  Status s;
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  if (!skip_filters) {
    filter_entry =
        GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr,
                  read_options.read_tier == kBlockCacheTier, get_context);
  }
  FilterBlockReader* filter = filter_entry.value;

  // First check the full filter
  // If full filter not useful, Then go into each block
  if (!FullFilterKeyMayMatch(read_options, filter, key, no_io,
                             prefix_extractor)) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
  } else {
    IndexBlockIter iiter_on_stack;
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         /* index_entry */ nullptr, get_context);
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    bool matched = false;  // if such user key matched a key in SST
    bool done = false;
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
      BlockHandle handle = iiter->value();

      // Block-based filters are probed per data block, keyed by block offset.
      bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() == true &&
          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
                               handle.offset(), no_io);

      if (not_exist_in_filter) {
        // Not found
        // TODO: think about interaction with Merge. If a user key cannot
        // cross one data block, we should be fine.
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
        break;
      } else {
        DataBlockIter biter;
        NewDataBlockIterator<DataBlockIter>(
            rep_, read_options, iiter->value(), &biter, false,
            true /* key_includes_seq */, get_context);

        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key in this block and the iter is not
          // at the end of the block, i.e. the key cannot be in the following
          // blocks either. In this case, the seek_key cannot be found, so we
          // break from the top level for-loop.
          break;
        }

        // Call the *saver function on each entry/block until it returns false
        for (; biter.Valid(); biter.Next()) {
          ParsedInternalKey parsed_key;
          if (!ParseInternalKey(biter.key(), &parsed_key)) {
            // NOTE(review): on parse failure only `s` is set and the loop
            // continues with `parsed_key` unset -- confirm whether a break
            // here is intended.
            s = Status::Corruption(Slice());
          }

          if (!get_context->SaveValue(
                  parsed_key, biter.value(), &matched,
                  biter.IsValuePinned() ? &biter : nullptr)) {
            done = true;
            break;
          }
        }
        s = biter.status();
      }
      if (done) {
        // Avoid the extra Next which is expensive in two-level indexes
        break;
      }
    }
    if (matched && filter != nullptr && !filter->IsBlockBased()) {
      RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
    }
    if (s.ok()) {
      s = iiter->status();
    }
  }

  // if rep_->filter_entry is not set, we should call Release(); otherwise
  // don't call, in this case we have a local copy in rep_->filter_entry,
  // it's pinned to the cache and will be released in the destructor
  if (!rep_->filter_entry.IsSet()) {
    filter_entry.Release(rep_->table_options.block_cache.get());
  }
  return s;
}
2427
2428Status BlockBasedTable::Prefetch(const Slice* const begin,
2429 const Slice* const end) {
2430 auto& comparator = rep_->internal_comparator;
11fdf7f2 2431 auto user_comparator = comparator.user_comparator();
7c673cae
FG
2432 // pre-condition
2433 if (begin && end && comparator.Compare(*begin, *end) > 0) {
2434 return Status::InvalidArgument(*begin, *end);
2435 }
2436
11fdf7f2
TL
2437 IndexBlockIter iiter_on_stack;
2438 auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
2439 std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
7c673cae 2440 if (iiter != &iiter_on_stack) {
11fdf7f2
TL
2441 iiter_unique_ptr =
2442 std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
7c673cae
FG
2443 }
2444
2445 if (!iiter->status().ok()) {
2446 // error opening index iterator
2447 return iiter->status();
2448 }
2449
2450 // indicates if we are on the last page that need to be pre-fetched
2451 bool prefetching_boundary_page = false;
2452
2453 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
2454 iiter->Next()) {
11fdf7f2
TL
2455 BlockHandle block_handle = iiter->value();
2456 const bool is_user_key = rep_->table_properties &&
2457 rep_->table_properties->index_key_is_user_key > 0;
2458 if (end &&
2459 ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
2460 (is_user_key &&
2461 user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
7c673cae
FG
2462 if (prefetching_boundary_page) {
2463 break;
2464 }
2465
2466 // The index entry represents the last key in the data block.
2467 // We should load this page into memory as well, but no more
2468 prefetching_boundary_page = true;
2469 }
2470
2471 // Load the block specified by the block_handle into the block cache
11fdf7f2
TL
2472 DataBlockIter biter;
2473 NewDataBlockIterator<DataBlockIter>(rep_, ReadOptions(), block_handle,
2474 &biter);
7c673cae
FG
2475
2476 if (!biter.status().ok()) {
2477 // there was an unexpected error while pre-fetching
2478 return biter.status();
2479 }
2480 }
2481
2482 return Status::OK();
2483}
2484
11fdf7f2
TL
2485Status BlockBasedTable::VerifyChecksum() {
2486 Status s;
2487 // Check Meta blocks
2488 std::unique_ptr<Block> meta;
2489 std::unique_ptr<InternalIterator> meta_iter;
2490 s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter);
2491 if (s.ok()) {
2492 s = VerifyChecksumInBlocks(meta_iter.get());
2493 if (!s.ok()) {
2494 return s;
2495 }
2496 } else {
2497 return s;
2498 }
2499 // Check Data blocks
2500 IndexBlockIter iiter_on_stack;
2501 InternalIteratorBase<BlockHandle>* iiter =
2502 NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
2503 std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
2504 if (iiter != &iiter_on_stack) {
2505 iiter_unique_ptr =
2506 std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
2507 }
2508 if (!iiter->status().ok()) {
2509 // error opening index iterator
2510 return iiter->status();
2511 }
2512 s = VerifyChecksumInBlocks(iiter);
2513 return s;
2514}
2515
2516Status BlockBasedTable::VerifyChecksumInBlocks(
2517 InternalIteratorBase<BlockHandle>* index_iter) {
2518 Status s;
2519 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2520 s = index_iter->status();
2521 if (!s.ok()) {
2522 break;
2523 }
2524 BlockHandle handle = index_iter->value();
2525 BlockContents contents;
2526 Slice dummy_comp_dict;
2527 BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */,
2528 rep_->footer, ReadOptions(), handle, &contents,
2529 rep_->ioptions, false /* decompress */,
2530 dummy_comp_dict /*compression dict*/,
2531 rep_->persistent_cache_options);
2532 s = block_fetcher.ReadBlockContents();
2533 if (!s.ok()) {
2534 break;
2535 }
2536 }
2537 return s;
2538}
2539
2540Status BlockBasedTable::VerifyChecksumInBlocks(
2541 InternalIteratorBase<Slice>* index_iter) {
2542 Status s;
2543 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2544 s = index_iter->status();
2545 if (!s.ok()) {
2546 break;
2547 }
2548 BlockHandle handle;
2549 Slice input = index_iter->value();
2550 s = handle.DecodeFrom(&input);
2551 BlockContents contents;
2552 Slice dummy_comp_dict;
2553 BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */,
2554 rep_->footer, ReadOptions(), handle, &contents,
2555 rep_->ioptions, false /* decompress */,
2556 dummy_comp_dict /*compression dict*/,
2557 rep_->persistent_cache_options);
2558 s = block_fetcher.ReadBlockContents();
2559 if (!s.ok()) {
2560 break;
2561 }
2562 }
2563 return s;
2564}
2565
7c673cae
FG
2566bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
2567 const Slice& key) {
11fdf7f2
TL
2568 std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
2569 NewIndexIterator(options));
7c673cae
FG
2570 iiter->Seek(key);
2571 assert(iiter->Valid());
2572 CachableEntry<Block> block;
2573
11fdf7f2 2574 BlockHandle handle = iiter->value();
7c673cae
FG
2575 Cache* block_cache = rep_->table_options.block_cache.get();
2576 assert(block_cache != nullptr);
2577
2578 char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2579 Slice cache_key =
11fdf7f2
TL
2580 GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
2581 cache_key_storage);
7c673cae
FG
2582 Slice ckey;
2583
11fdf7f2 2584 Status s;
7c673cae
FG
2585 s = GetDataBlockFromCache(
2586 cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block,
2587 rep_->table_options.format_version,
2588 rep_->compression_dict_block ? rep_->compression_dict_block->data
2589 : Slice(),
2590 0 /* read_amp_bytes_per_bit */);
2591 assert(s.ok());
2592 bool in_cache = block.value != nullptr;
2593 if (in_cache) {
2594 ReleaseCachedEntry(block_cache, block.cache_handle);
2595 }
2596 return in_cache;
2597}
2598
11fdf7f2 2599BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
7c673cae
FG
2600 // Some old version of block-based tables don't have index type present in
2601 // table properties. If that's the case we can safely use the kBinarySearch.
11fdf7f2
TL
2602 BlockBasedTableOptions::IndexType index_type_on_file =
2603 BlockBasedTableOptions::kBinarySearch;
7c673cae
FG
2604 if (rep_->table_properties) {
2605 auto& props = rep_->table_properties->user_collected_properties;
2606 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
2607 if (pos != props.end()) {
2608 index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
2609 DecodeFixed32(pos->second.c_str()));
11fdf7f2
TL
2610 // update index_type with the true type
2611 rep_->index_type = index_type_on_file;
7c673cae
FG
2612 }
2613 }
11fdf7f2
TL
2614 return index_type_on_file;
2615}
2616
2617// REQUIRES: The following fields of rep_ should have already been populated:
2618// 1. file
2619// 2. index_handle,
2620// 3. options
2621// 4. internal_comparator
2622// 5. index_type
2623Status BlockBasedTable::CreateIndexReader(
2624 FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
2625 InternalIterator* preloaded_meta_index_iter, int level) {
2626 auto index_type_on_file = UpdateIndexType();
7c673cae
FG
2627
2628 auto file = rep_->file.get();
11fdf7f2 2629 const InternalKeyComparator* icomparator = &rep_->internal_comparator;
7c673cae 2630 const Footer& footer = rep_->footer;
11fdf7f2
TL
2631
2632 // kHashSearch requires non-empty prefix_extractor but bypass checking
2633 // prefix_extractor here since we have no access to MutableCFOptions.
2634 // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
2635 // If prefix_extractor does not match prefix_extractor_name from table
2636 // properties, turn off Hash Index by setting total_order_seek to true
7c673cae
FG
2637
2638 switch (index_type_on_file) {
2639 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
2640 return PartitionIndexReader::Create(
11fdf7f2
TL
2641 this, file, prefetch_buffer, footer, footer.index_handle(),
2642 rep_->ioptions, icomparator, index_reader,
2643 rep_->persistent_cache_options, level,
2644 rep_->table_properties == nullptr ||
2645 rep_->table_properties->index_key_is_user_key == 0,
2646 rep_->table_properties == nullptr ||
2647 rep_->table_properties->index_value_is_delta_encoded == 0);
7c673cae
FG
2648 }
2649 case BlockBasedTableOptions::kBinarySearch: {
2650 return BinarySearchIndexReader::Create(
11fdf7f2
TL
2651 file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions,
2652 icomparator, index_reader, rep_->persistent_cache_options,
2653 rep_->table_properties == nullptr ||
2654 rep_->table_properties->index_key_is_user_key == 0,
2655 rep_->table_properties == nullptr ||
2656 rep_->table_properties->index_value_is_delta_encoded == 0);
7c673cae
FG
2657 }
2658 case BlockBasedTableOptions::kHashSearch: {
2659 std::unique_ptr<Block> meta_guard;
2660 std::unique_ptr<InternalIterator> meta_iter_guard;
2661 auto meta_index_iter = preloaded_meta_index_iter;
2662 if (meta_index_iter == nullptr) {
11fdf7f2
TL
2663 auto s =
2664 ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard);
7c673cae
FG
2665 if (!s.ok()) {
2666 // we simply fall back to binary search in case there is any
2667 // problem with prefix hash index loading.
2668 ROCKS_LOG_WARN(rep_->ioptions.info_log,
2669 "Unable to read the metaindex block."
2670 " Fall back to binary search index.");
2671 return BinarySearchIndexReader::Create(
11fdf7f2
TL
2672 file, prefetch_buffer, footer, footer.index_handle(),
2673 rep_->ioptions, icomparator, index_reader,
2674 rep_->persistent_cache_options,
2675 rep_->table_properties == nullptr ||
2676 rep_->table_properties->index_key_is_user_key == 0,
2677 rep_->table_properties == nullptr ||
2678 rep_->table_properties->index_value_is_delta_encoded == 0);
7c673cae
FG
2679 }
2680 meta_index_iter = meta_iter_guard.get();
2681 }
2682
2683 return HashIndexReader::Create(
11fdf7f2
TL
2684 rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer,
2685 rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter,
2686 index_reader, rep_->hash_index_allow_collision,
2687 rep_->persistent_cache_options,
2688 rep_->table_properties == nullptr ||
2689 rep_->table_properties->index_key_is_user_key == 0,
2690 rep_->table_properties == nullptr ||
2691 rep_->table_properties->index_value_is_delta_encoded == 0);
7c673cae
FG
2692 }
2693 default: {
2694 std::string error_message =
11fdf7f2 2695 "Unrecognized index type: " + ToString(index_type_on_file);
7c673cae
FG
2696 return Status::InvalidArgument(error_message.c_str());
2697 }
2698 }
2699}
2700
2701uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
11fdf7f2
TL
2702 unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
2703 NewIndexIterator(ReadOptions()));
7c673cae
FG
2704
2705 index_iter->Seek(key);
2706 uint64_t result;
2707 if (index_iter->Valid()) {
11fdf7f2
TL
2708 BlockHandle handle = index_iter->value();
2709 result = handle.offset();
7c673cae
FG
2710 } else {
2711 // key is past the last key in the file. If table_properties is not
2712 // available, approximate the offset by returning the offset of the
2713 // metaindex block (which is right near the end of the file).
2714 result = 0;
2715 if (rep_->table_properties) {
2716 result = rep_->table_properties->data_size;
2717 }
2718 // table_properties is not present in the table.
2719 if (result == 0) {
2720 result = rep_->footer.metaindex_handle().offset();
2721 }
2722 }
2723 return result;
2724}
2725
2726bool BlockBasedTable::TEST_filter_block_preloaded() const {
2727 return rep_->filter != nullptr;
2728}
2729
2730bool BlockBasedTable::TEST_index_reader_preloaded() const {
2731 return rep_->index_reader != nullptr;
2732}
2733
2734Status BlockBasedTable::GetKVPairsFromDataBlocks(
2735 std::vector<KVPairBlock>* kv_pair_blocks) {
11fdf7f2 2736 std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
7c673cae
FG
2737 NewIndexIterator(ReadOptions()));
2738
2739 Status s = blockhandles_iter->status();
2740 if (!s.ok()) {
2741 // Cannot read Index Block
2742 return s;
2743 }
2744
2745 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
2746 blockhandles_iter->Next()) {
2747 s = blockhandles_iter->status();
2748
2749 if (!s.ok()) {
2750 break;
2751 }
2752
2753 std::unique_ptr<InternalIterator> datablock_iter;
11fdf7f2
TL
2754 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
2755 rep_, ReadOptions(), blockhandles_iter->value()));
7c673cae
FG
2756 s = datablock_iter->status();
2757
2758 if (!s.ok()) {
2759 // Error reading the block - Skipped
2760 continue;
2761 }
2762
2763 KVPairBlock kv_pair_block;
2764 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
2765 datablock_iter->Next()) {
2766 s = datablock_iter->status();
2767 if (!s.ok()) {
2768 // Error reading the block - Skipped
2769 break;
2770 }
2771 const Slice& key = datablock_iter->key();
2772 const Slice& value = datablock_iter->value();
2773 std::string key_copy = std::string(key.data(), key.size());
2774 std::string value_copy = std::string(value.data(), value.size());
2775
2776 kv_pair_block.push_back(
2777 std::make_pair(std::move(key_copy), std::move(value_copy)));
2778 }
2779 kv_pair_blocks->push_back(std::move(kv_pair_block));
2780 }
2781 return Status::OK();
2782}
2783
11fdf7f2
TL
2784Status BlockBasedTable::DumpTable(WritableFile* out_file,
2785 const SliceTransform* prefix_extractor) {
7c673cae
FG
2786 // Output Footer
2787 out_file->Append(
2788 "Footer Details:\n"
2789 "--------------------------------------\n"
2790 " ");
2791 out_file->Append(rep_->footer.ToString().c_str());
2792 out_file->Append("\n");
2793
2794 // Output MetaIndex
2795 out_file->Append(
2796 "Metaindex Details:\n"
2797 "--------------------------------------\n");
2798 std::unique_ptr<Block> meta;
2799 std::unique_ptr<InternalIterator> meta_iter;
11fdf7f2
TL
2800 Status s =
2801 ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter);
7c673cae
FG
2802 if (s.ok()) {
2803 for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
2804 s = meta_iter->status();
2805 if (!s.ok()) {
2806 return s;
2807 }
2808 if (meta_iter->key() == rocksdb::kPropertiesBlock) {
2809 out_file->Append(" Properties block handle: ");
2810 out_file->Append(meta_iter->value().ToString(true).c_str());
2811 out_file->Append("\n");
2812 } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
2813 out_file->Append(" Compression dictionary block handle: ");
2814 out_file->Append(meta_iter->value().ToString(true).c_str());
2815 out_file->Append("\n");
2816 } else if (strstr(meta_iter->key().ToString().c_str(),
2817 "filter.rocksdb.") != nullptr) {
2818 out_file->Append(" Filter block handle: ");
2819 out_file->Append(meta_iter->value().ToString(true).c_str());
2820 out_file->Append("\n");
2821 } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
2822 out_file->Append(" Range deletion block handle: ");
2823 out_file->Append(meta_iter->value().ToString(true).c_str());
2824 out_file->Append("\n");
2825 }
2826 }
2827 out_file->Append("\n");
2828 } else {
2829 return s;
2830 }
2831
2832 // Output TableProperties
2833 const rocksdb::TableProperties* table_properties;
2834 table_properties = rep_->table_properties.get();
2835
2836 if (table_properties != nullptr) {
2837 out_file->Append(
2838 "Table Properties:\n"
2839 "--------------------------------------\n"
2840 " ");
2841 out_file->Append(table_properties->ToString("\n ", ": ").c_str());
2842 out_file->Append("\n");
7c673cae 2843
11fdf7f2
TL
2844 // Output Filter blocks
2845 if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
2846 // Support only BloomFilter as off now
2847 rocksdb::BlockBasedTableOptions table_options;
2848 table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
2849 if (table_properties->filter_policy_name.compare(
2850 table_options.filter_policy->Name()) == 0) {
2851 std::string filter_block_key = kFilterBlockPrefix;
2852 filter_block_key.append(table_properties->filter_policy_name);
2853 BlockHandle handle;
2854 if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
2855 BlockContents block;
2856 Slice dummy_comp_dict;
2857 BlockFetcher block_fetcher(
2858 rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer,
2859 ReadOptions(), handle, &block, rep_->ioptions,
2860 false /*decompress*/, dummy_comp_dict /*compression dict*/,
2861 rep_->persistent_cache_options);
2862 s = block_fetcher.ReadBlockContents();
2863 if (!s.ok()) {
2864 rep_->filter.reset(new BlockBasedFilterBlockReader(
2865 prefix_extractor, table_options,
2866 table_options.whole_key_filtering, std::move(block),
2867 rep_->ioptions.statistics));
2868 }
7c673cae
FG
2869 }
2870 }
2871 }
2872 }
2873 if (rep_->filter) {
2874 out_file->Append(
2875 "Filter Details:\n"
2876 "--------------------------------------\n"
2877 " ");
2878 out_file->Append(rep_->filter->ToString().c_str());
2879 out_file->Append("\n");
2880 }
2881
2882 // Output Index block
2883 s = DumpIndexBlock(out_file);
2884 if (!s.ok()) {
2885 return s;
2886 }
2887
2888 // Output compression dictionary
2889 if (rep_->compression_dict_block != nullptr) {
2890 auto compression_dict = rep_->compression_dict_block->data;
2891 out_file->Append(
2892 "Compression Dictionary:\n"
2893 "--------------------------------------\n");
2894 out_file->Append(" size (bytes): ");
2895 out_file->Append(rocksdb::ToString(compression_dict.size()));
2896 out_file->Append("\n\n");
2897 out_file->Append(" HEX ");
2898 out_file->Append(compression_dict.ToString(true).c_str());
2899 out_file->Append("\n\n");
2900 }
2901
2902 // Output range deletions block
2903 auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
2904 if (range_del_iter != nullptr) {
2905 range_del_iter->SeekToFirst();
2906 if (range_del_iter->Valid()) {
2907 out_file->Append(
2908 "Range deletions:\n"
2909 "--------------------------------------\n"
2910 " ");
2911 for (; range_del_iter->Valid(); range_del_iter->Next()) {
2912 DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
2913 }
2914 out_file->Append("\n");
2915 }
2916 delete range_del_iter;
2917 }
2918 // Output Data blocks
2919 s = DumpDataBlocks(out_file);
2920
2921 return s;
2922}
2923
2924void BlockBasedTable::Close() {
11fdf7f2
TL
2925 if (rep_->closed) {
2926 return;
2927 }
7c673cae
FG
2928 rep_->filter_entry.Release(rep_->table_options.block_cache.get());
2929 rep_->index_entry.Release(rep_->table_options.block_cache.get());
2930 rep_->range_del_entry.Release(rep_->table_options.block_cache.get());
2931 // cleanup index and filter blocks to avoid accessing dangling pointer
2932 if (!rep_->table_options.no_block_cache) {
2933 char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2934 // Get the filter block key
2935 auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
11fdf7f2 2936 rep_->filter_handle, cache_key);
7c673cae
FG
2937 rep_->table_options.block_cache.get()->Erase(key);
2938 // Get the index block key
2939 key = GetCacheKeyFromOffset(rep_->cache_key_prefix,
2940 rep_->cache_key_prefix_size,
2941 rep_->dummy_index_reader_offset, cache_key);
2942 rep_->table_options.block_cache.get()->Erase(key);
2943 }
11fdf7f2 2944 rep_->closed = true;
7c673cae
FG
2945}
2946
2947Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
2948 out_file->Append(
2949 "Index Details:\n"
2950 "--------------------------------------\n");
11fdf7f2 2951 std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
7c673cae
FG
2952 NewIndexIterator(ReadOptions()));
2953 Status s = blockhandles_iter->status();
2954 if (!s.ok()) {
2955 out_file->Append("Can not read Index Block \n\n");
2956 return s;
2957 }
2958
2959 out_file->Append(" Block key hex dump: Data block handle\n");
2960 out_file->Append(" Block key ascii\n\n");
2961 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
2962 blockhandles_iter->Next()) {
2963 s = blockhandles_iter->status();
2964 if (!s.ok()) {
2965 break;
2966 }
2967 Slice key = blockhandles_iter->key();
11fdf7f2 2968 Slice user_key;
7c673cae 2969 InternalKey ikey;
11fdf7f2
TL
2970 if (rep_->table_properties &&
2971 rep_->table_properties->index_key_is_user_key != 0) {
2972 user_key = key;
2973 } else {
2974 ikey.DecodeFrom(key);
2975 user_key = ikey.user_key();
2976 }
7c673cae
FG
2977
2978 out_file->Append(" HEX ");
11fdf7f2 2979 out_file->Append(user_key.ToString(true).c_str());
7c673cae
FG
2980 out_file->Append(": ");
2981 out_file->Append(blockhandles_iter->value().ToString(true).c_str());
2982 out_file->Append("\n");
2983
11fdf7f2 2984 std::string str_key = user_key.ToString();
7c673cae
FG
2985 std::string res_key("");
2986 char cspace = ' ';
2987 for (size_t i = 0; i < str_key.size(); i++) {
2988 res_key.append(&str_key[i], 1);
2989 res_key.append(1, cspace);
2990 }
2991 out_file->Append(" ASCII ");
2992 out_file->Append(res_key.c_str());
2993 out_file->Append("\n ------\n");
2994 }
2995 out_file->Append("\n");
2996 return Status::OK();
2997}
2998
2999Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
11fdf7f2 3000 std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
7c673cae
FG
3001 NewIndexIterator(ReadOptions()));
3002 Status s = blockhandles_iter->status();
3003 if (!s.ok()) {
3004 out_file->Append("Can not read Index Block \n\n");
3005 return s;
3006 }
3007
3008 uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
3009 uint64_t datablock_size_max = 0;
3010 uint64_t datablock_size_sum = 0;
3011
3012 size_t block_id = 1;
3013 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3014 block_id++, blockhandles_iter->Next()) {
3015 s = blockhandles_iter->status();
3016 if (!s.ok()) {
3017 break;
3018 }
3019
11fdf7f2 3020 BlockHandle bh = blockhandles_iter->value();
7c673cae
FG
3021 uint64_t datablock_size = bh.size();
3022 datablock_size_min = std::min(datablock_size_min, datablock_size);
3023 datablock_size_max = std::max(datablock_size_max, datablock_size);
3024 datablock_size_sum += datablock_size;
3025
3026 out_file->Append("Data Block # ");
3027 out_file->Append(rocksdb::ToString(block_id));
3028 out_file->Append(" @ ");
3029 out_file->Append(blockhandles_iter->value().ToString(true).c_str());
3030 out_file->Append("\n");
3031 out_file->Append("--------------------------------------\n");
3032
3033 std::unique_ptr<InternalIterator> datablock_iter;
11fdf7f2
TL
3034 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3035 rep_, ReadOptions(), blockhandles_iter->value()));
7c673cae
FG
3036 s = datablock_iter->status();
3037
3038 if (!s.ok()) {
3039 out_file->Append("Error reading the block - Skipped \n\n");
3040 continue;
3041 }
3042
3043 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
3044 datablock_iter->Next()) {
3045 s = datablock_iter->status();
3046 if (!s.ok()) {
3047 out_file->Append("Error reading the block - Skipped \n");
3048 break;
3049 }
3050 DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
3051 }
3052 out_file->Append("\n");
3053 }
3054
3055 uint64_t num_datablocks = block_id - 1;
3056 if (num_datablocks) {
3057 double datablock_size_avg =
3058 static_cast<double>(datablock_size_sum) / num_datablocks;
3059 out_file->Append("Data Block Summary:\n");
3060 out_file->Append("--------------------------------------");
3061 out_file->Append("\n # data blocks: ");
3062 out_file->Append(rocksdb::ToString(num_datablocks));
3063 out_file->Append("\n min data block size: ");
3064 out_file->Append(rocksdb::ToString(datablock_size_min));
3065 out_file->Append("\n max data block size: ");
3066 out_file->Append(rocksdb::ToString(datablock_size_max));
3067 out_file->Append("\n avg data block size: ");
3068 out_file->Append(rocksdb::ToString(datablock_size_avg));
3069 out_file->Append("\n");
3070 }
3071
3072 return Status::OK();
3073}
3074
3075void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
3076 WritableFile* out_file) {
3077 InternalKey ikey;
3078 ikey.DecodeFrom(key);
3079
3080 out_file->Append(" HEX ");
3081 out_file->Append(ikey.user_key().ToString(true).c_str());
3082 out_file->Append(": ");
3083 out_file->Append(value.ToString(true).c_str());
3084 out_file->Append("\n");
3085
3086 std::string str_key = ikey.user_key().ToString();
3087 std::string str_value = value.ToString();
3088 std::string res_key(""), res_value("");
3089 char cspace = ' ';
3090 for (size_t i = 0; i < str_key.size(); i++) {
11fdf7f2
TL
3091 if (str_key[i] == '\0') {
3092 res_key.append("\\0", 2);
3093 } else {
3094 res_key.append(&str_key[i], 1);
3095 }
7c673cae
FG
3096 res_key.append(1, cspace);
3097 }
3098 for (size_t i = 0; i < str_value.size(); i++) {
11fdf7f2
TL
3099 if (str_value[i] == '\0') {
3100 res_value.append("\\0", 2);
3101 } else {
3102 res_value.append(&str_value[i], 1);
3103 }
7c673cae
FG
3104 res_value.append(1, cspace);
3105 }
3106
3107 out_file->Append(" ASCII ");
3108 out_file->Append(res_key.c_str());
3109 out_file->Append(": ");
3110 out_file->Append(res_value.c_str());
3111 out_file->Append("\n ------\n");
3112}
3113
3114namespace {
3115
11fdf7f2 3116void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
7c673cae
FG
3117 FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
3118 if (filter->statistics() != nullptr) {
3119 RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
11fdf7f2 3120 filter->ApproximateMemoryUsage());
7c673cae
FG
3121 }
3122 delete filter;
3123}
3124
11fdf7f2 3125void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) {
7c673cae
FG
3126 IndexReader* index_reader = reinterpret_cast<IndexReader*>(value);
3127 if (index_reader->statistics() != nullptr) {
3128 RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT,
11fdf7f2 3129 index_reader->ApproximateMemoryUsage());
7c673cae
FG
3130 }
3131 delete index_reader;
3132}
3133
3134} // anonymous namespace
3135
3136} // namespace rocksdb