// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based_table_reader.h"

#include <algorithm>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block.h"
#include "table/block_based_filter_block.h"
#include "table/block_based_table_factory.h"
#include "table/block_prefix_index.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/full_filter_block.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/partitioned_filter_block.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "util/coding.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/sync_point.h"

namespace rocksdb {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;
using std::unique_ptr;

typedef BlockBasedTable::IndexReader IndexReader;

BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
}

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param compression_dict Data for presetting the compression library's
//    dictionary.
Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer,
                         const ReadOptions& options, const BlockHandle& handle,
                         std::unique_ptr<Block>* result,
                         const ImmutableCFOptions& ioptions, bool do_uncompress,
                         const Slice& compression_dict,
                         const PersistentCacheOptions& cache_options,
                         SequenceNumber global_seqno,
                         size_t read_amp_bytes_per_bit) {
  BlockContents contents;
  Status s = ReadBlockContents(file, footer, options, handle, &contents,
                               ioptions, do_uncompress, compression_dict,
                               cache_options);
  if (s.ok()) {
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}

// Delete the resource that is held by the iterator.
template <class ResourceType>
void DeleteHeldResource(void* arg, void* ignored) {
  delete reinterpret_cast<ResourceType*>(arg);
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& key, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

void DeleteCachedFilterEntry(const Slice& key, void* value);
void DeleteCachedIndexEntry(const Slice& key, void* value);

// Release the cached entry and decrement its ref count.
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}

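// Build a cache key by appending the varint64-encoded `offset` to the
// table's cache key prefix. `cache_key` must point to a buffer of at least
// kMaxCacheKeyPrefixSize + kMaxVarint64Length bytes; the returned Slice
// refers into that buffer.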
Slice GetCacheKeyFromOffset(const char* cache_key_prefix,
                            size_t cache_key_prefix_size, uint64_t offset,
                            char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset);
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

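// Look up `key` in `block_cache`, updating both the overall and the
// block-type-specific hit/miss tickers. Returns nullptr on a cache miss;
// the caller is responsible for releasing a non-null handle.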
Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                 Tickers block_cache_miss_ticker,
                                 Tickers block_cache_hit_ticker,
                                 Statistics* statistics) {
  auto cache_handle = block_cache->Lookup(key, statistics);
  if (cache_handle != nullptr) {
    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    // overall cache hit
    RecordTick(statistics, BLOCK_CACHE_HIT);
    // total bytes read from cache
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
               block_cache->GetUsage(cache_handle));
    // block-type specific cache hit
    RecordTick(statistics, block_cache_hit_ticker);
  } else {
    // overall cache miss
    RecordTick(statistics, BLOCK_CACHE_MISS);
    // block-type specific cache miss
    RecordTick(statistics, block_cache_miss_ticker);
  }

  return cache_handle;
}

}  // namespace

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public IndexReader, public Cleanable {
 public:
  // Read the partition index from the file and create an instance of
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(BlockBasedTable* table, RandomAccessFileReader* file,
                       const Footer& footer, const BlockHandle& index_handle,
                       const ImmutableCFOptions& ioptions,
                       const Comparator* comparator, IndexReader** index_reader,
                       const PersistentCacheOptions& cache_options,
                       const int level) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, footer, ReadOptions(), index_handle, &index_block, ioptions,
        true /* decompress */, Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (s.ok()) {
      *index_reader =
          new PartitionIndexReader(table, comparator, std::move(index_block),
                                   ioptions.statistics, level);
    }

    return s;
  }

  // return a two-level iterator: first level is on the partition index
  virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                        bool dont_care = true) override {
    // Filters are already checked before seeking the index
    const bool skip_filters = true;
    const bool is_index = true;
    Cleanable* block_cache_cleaner = nullptr;
    const bool pin_cached_indexes =
        level_ == 0 &&
        table_->rep_->table_options.pin_l0_filter_and_index_blocks_in_cache;
    if (pin_cached_indexes) {
      // Keep partition indexes into the cache as long as the partition index
      // reader object is alive
      block_cache_cleaner = this;
    }
    return NewTwoLevelIterator(
        new BlockBasedTable::BlockEntryIteratorState(
            table_, ReadOptions(), skip_filters, is_index, block_cache_cleaner),
        index_block_->NewIterator(comparator_, nullptr, true));
    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    return index_block_->ApproximateMemoryUsage();
  }

 private:
  PartitionIndexReader(BlockBasedTable* table, const Comparator* comparator,
                       std::unique_ptr<Block>&& index_block, Statistics* stats,
                       const int level)
      : IndexReader(comparator, stats),
        table_(table),
        index_block_(std::move(index_block)),
        level_(level) {
    assert(index_block_ != nullptr);
  }
  BlockBasedTable* table_;
  std::unique_ptr<Block> index_block_;
  int level_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public IndexReader {
 public:
  // Read index from the file and create an instance of
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(RandomAccessFileReader* file, const Footer& footer,
                       const BlockHandle& index_handle,
                       const ImmutableCFOptions& ioptions,
                       const Comparator* comparator, IndexReader** index_reader,
                       const PersistentCacheOptions& cache_options) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, footer, ReadOptions(), index_handle, &index_block, ioptions,
        true /* decompress */, Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (s.ok()) {
      *index_reader = new BinarySearchIndexReader(
          comparator, std::move(index_block), ioptions.statistics);
    }

    return s;
  }

  virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                        bool dont_care = true) override {
    return index_block_->NewIterator(comparator_, iter, true);
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    return index_block_->ApproximateMemoryUsage();
  }

 private:
  BinarySearchIndexReader(const Comparator* comparator,
                          std::unique_ptr<Block>&& index_block,
                          Statistics* stats)
      : IndexReader(comparator, stats), index_block_(std::move(index_block)) {
    assert(index_block_ != nullptr);
  }
  std::unique_ptr<Block> index_block_;
};

// Index that leverages an internal hash table to quicken the lookup for a
// given key.
class HashIndexReader : public IndexReader {
 public:
  static Status Create(const SliceTransform* hash_key_extractor,
                       const Footer& footer, RandomAccessFileReader* file,
                       const ImmutableCFOptions& ioptions,
                       const Comparator* comparator,
                       const BlockHandle& index_handle,
                       InternalIterator* meta_index_iter,
                       IndexReader** index_reader,
                       bool hash_index_allow_collision,
                       const PersistentCacheOptions& cache_options) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, footer, ReadOptions(), index_handle, &index_block, ioptions,
        true /* decompress */, Slice() /*compression dict*/, cache_options,
        kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

    if (!s.ok()) {
      return s;
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader =
        new HashIndexReader(comparator, std::move(index_block),
                            ioptions.statistics);
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                      &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Read contents for the blocks
    BlockContents prefixes_contents;
    s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle,
                          &prefixes_contents, ioptions, true /* decompress */,
                          Slice() /*compression dict*/, cache_options);
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle,
                          &prefixes_meta_contents, ioptions,
                          true /* decompress */, Slice() /*compression dict*/,
                          cache_options);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index);
    }

    return Status::OK();
  }

  virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                        bool total_order_seek = true) override {
    return index_block_->NewIterator(comparator_, iter, total_order_seek);
  }

  virtual size_t size() const override { return index_block_->size(); }
  virtual size_t usable_size() const override {
    return index_block_->usable_size();
  }

  virtual size_t ApproximateMemoryUsage() const override {
    assert(index_block_);
    return index_block_->ApproximateMemoryUsage() +
           prefixes_contents_.data.size();
  }

 private:
  HashIndexReader(const Comparator* comparator,
                  std::unique_ptr<Block>&& index_block, Statistics* stats)
      : IndexReader(comparator, stats), index_block_(std::move(index_block)) {
    assert(index_block_ != nullptr);
  }

  ~HashIndexReader() {
  }

  std::unique_ptr<Block> index_block_;
  BlockContents prefixes_contents_;
};

// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
    // Create a dummy offset for the index reader that is beyond the file size.
    rep->dummy_index_reader_offset =
        file_size + rep->table_options.block_cache->NewId();
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc,
    RandomAccessFile* file, char* buffer, size_t* size) {

  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc,
    WritableFile* file, char* buffer, size_t* size) {

  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (*size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {
// Return true if table_properties has `user_prop_name` with a `true` value,
// or if it doesn't contain this property (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

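// Extract the global sequence number from the user-collected properties of an
// external SST file. Returns kDisableGlobalSequenceNumber for regular files
// and for v1 external files, which predate the global seqno property.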
SequenceNumber GetGlobalSequenceNumber(const TableProperties& table_properties,
                                       Logger* info_log) {
  auto& props = table_properties.user_collected_properties;

  auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      // This is not an external sst file, global_seqno is not supported.
      assert(false);
      ROCKS_LOG_ERROR(
          info_log,
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
    }
    return kDisableGlobalSequenceNumber;
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      // This is a v1 external sst file, global_seqno is not supported.
      assert(false);
      ROCKS_LOG_ERROR(
          info_log,
          "An external sst file with version %u has a global seqno property "
          "with value %s",
          version, seqno_pos->second.c_str());
    }
    return kDisableGlobalSequenceNumber;
  }

  SequenceNumber global_seqno = DecodeFixed64(seqno_pos->second.c_str());

  if (global_seqno > kMaxSequenceNumber) {
    assert(false);
    ROCKS_LOG_ERROR(
        info_log,
        "An external sst file with version %u has a global seqno property "
        "with value %llu, which is greater than kMaxSequenceNumber",
        version, global_seqno);
  }

  return global_seqno;
}
}  // namespace

Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

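// Open the table: read and validate the footer, load the meta-index,
// properties, compression dictionary, and range-deletion blocks, then either
// prefetch the index and filter into the block cache or pre-load them into
// the Rep, depending on cache_index_and_filter_blocks.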
Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                             const EnvOptions& env_options,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
                             unique_ptr<RandomAccessFileReader>&& file,
                             uint64_t file_size,
                             unique_ptr<TableReader>* table_reader,
                             const bool prefetch_index_and_filter_in_cache,
                             const bool skip_filters, const int level) {
  table_reader->reset();

  Footer footer;

  // Before reading the footer, read ahead to prefetch the tail of the file
  Status s =
      file->Prefetch((file_size < 512 * 1024 ? 0 : file_size - 512 * 1024),
                     512 * 1024 /* 512 KB prefetching */);
  s = ReadFooterFromFile(file.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with a newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not mutate rep_ after the creation. e.g. internal_prefix_transform
  // raw pointer will be used to create HashIndexReader, whose reset may
  // access a dangling pointer.
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->index_type = table_options.index_type;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(rep->ioptions.prefix_extractor));
  SetupCacheKeyPrefix(rep, file_size);
  unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Read meta index
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = ReadMetaBlock(rep, &meta, &meta_iter);
  if (!s.ok()) {
    return s;
  }

  // Find filter handle and filter type
  if (rep->filter_policy) {
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep->filter_policy->Name());
      if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle)
              .ok()) {
        rep->filter_type = filter_type;
        break;
      }
    }
  }

  // Read the properties
  bool found_properties_block = true;
  s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer,
                         rep->ioptions, &table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      rep->table_properties.reset(table_properties);
    }
  } else {
    ROCKS_LOG_ERROR(rep->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }

  // Read the compression dictionary meta block
  bool found_compression_dict;
  s = SeekToCompressionDictBlock(meta_iter.get(), &found_compression_dict);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep->ioptions.info_log,
        "Error when seeking to compression dictionary block from file: %s",
        s.ToString().c_str());
  } else if (found_compression_dict) {
    // TODO(andrewkr): Add to block cache if cache_index_and_filter_blocks is
    // true.
    unique_ptr<BlockContents> compression_dict_block{new BlockContents()};
    // TODO(andrewkr): ReadMetaBlock repeats SeekToCompressionDictBlock().
    // maybe decode a handle from meta_iter
    // and do ReadBlockContents(handle) instead
    s = rocksdb::ReadMetaBlock(rep->file.get(), file_size,
                               kBlockBasedTableMagicNumber, rep->ioptions,
                               rocksdb::kCompressionDictBlock,
                               compression_dict_block.get());
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep->ioptions.info_log,
          "Encountered error while reading data from compression dictionary "
          "block %s",
          s.ToString().c_str());
    } else {
      rep->compression_dict_block = std::move(compression_dict_block);
    }
  }

  // Read the range del meta block
  bool found_range_del_block;
  s = SeekToRangeDelBlock(meta_iter.get(), &found_range_del_block,
                          &rep->range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else {
    if (found_range_del_block && !rep->range_del_handle.IsNull()) {
      ReadOptions read_options;
      s = MaybeLoadDataBlockToCache(rep, read_options, rep->range_del_handle,
                                    Slice() /* compression_dict */,
                                    &rep->range_del_entry);
      if (!s.ok()) {
        ROCKS_LOG_WARN(
            rep->ioptions.info_log,
            "Encountered error while reading data from range del block %s",
            s.ToString().c_str());
      }
    }
  }

  // Determine whether whole key filtering is supported.
  if (rep->table_properties) {
    rep->whole_key_filtering &=
        IsFeatureSupported(*(rep->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep->ioptions.info_log);
    rep->prefix_filtering &= IsFeatureSupported(
        *(rep->table_properties),
        BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log);

    rep->global_seqno = GetGlobalSequenceNumber(*(rep->table_properties),
                                                rep->ioptions.info_log);
  }

  // pre-fetching of blocks is turned on
  // Will use block cache for index/filter blocks access
  // Always prefetch index and filter for level 0
  if (table_options.cache_index_and_filter_blocks) {
    if (prefetch_index_and_filter_in_cache || level == 0) {
      assert(table_options.block_cache != nullptr);
      // Hack: Call NewIndexIterator() to implicitly add index to the
      // block_cache

      // if pin_l0_filter_and_index_blocks_in_cache is true and this is
      // a level0 file, then we will pass in this pointer to rep->index
      // to NewIndexIterator(), which will save the index block in there
      // else it's a nullptr and nothing special happens
      CachableEntry<IndexReader>* index_entry = nullptr;
      if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache &&
          level == 0) {
        index_entry = &rep->index_entry;
      }
      unique_ptr<InternalIterator> iter(
          new_table->NewIndexIterator(ReadOptions(), nullptr, index_entry));
      s = iter->status();

      if (s.ok()) {
        // Hack: Call GetFilter() to implicitly add filter to the block_cache
        auto filter_entry = new_table->GetFilter();
        // if pin_l0_filter_and_index_blocks_in_cache is true, and this is
        // a level0 file, then save it in rep_->filter_entry; it will be
        // released in the destructor only, hence it will be pinned in the
        // cache while this reader is alive
        if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache &&
            level == 0) {
          rep->filter_entry = filter_entry;
          if (rep->filter_entry.value != nullptr) {
            rep->filter_entry.value->SetLevel(level);
          }
        } else {
          filter_entry.Release(table_options.block_cache.get());
        }
      }
    }
  } else {
    // If we don't use block cache for index/filter blocks access, we'll
    // pre-load these blocks, which will be kept in member variables in Rep
    // with the same lifetime as this table object.
    IndexReader* index_reader = nullptr;
    s = new_table->CreateIndexReader(&index_reader, meta_iter.get(), level);

    if (s.ok()) {
      rep->index_reader.reset(index_reader);

      // Set filter block
      if (rep->filter_policy) {
        const bool is_a_filter_partition = true;
        rep->filter.reset(
            new_table->ReadFilter(rep->filter_handle, !is_a_filter_partition));
        if (rep->filter.get()) {
          rep->filter->SetLevel(level);
        }
      }
    } else {
      delete index_reader;
    }
  }

  if (s.ok()) {
    *table_reader = std::move(new_table);
  }

  return s;
}

void BlockBasedTable::SetupForCompaction() {
  switch (rep_->ioptions.access_hint_on_compaction_start) {
    case Options::NONE:
      break;
    case Options::NORMAL:
      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
      break;
    case Options::SEQUENTIAL:
      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
      break;
    case Options::WILLNEED:
      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
      break;
    default:
      assert(false);
  }
  compaction_optimized_ = true;
}

std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}

size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  return usage;
}

// Load the meta-block from the file. On success, return the loaded meta block
// and its iterator.
Status BlockBasedTable::ReadMetaBlock(Rep* rep,
                                      std::unique_ptr<Block>* meta_block,
                                      std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  // TODO: we never really verify checksum for meta index block
  std::unique_ptr<Block> meta;
  Status s = ReadBlockFromFile(
      rep->file.get(), rep->footer, ReadOptions(),
      rep->footer.metaindex_handle(), &meta, rep->ioptions,
      true /* decompress */, Slice() /*compression dict*/,
      rep->persistent_cache_options, kDisableGlobalSequenceNumber,
      0 /* read_amp_bytes_per_bit */);

  if (!s.ok()) {
    ROCKS_LOG_ERROR(rep->ioptions.info_log,
                    "Encountered error while reading data from meta index"
                    " block %s",
                    s.ToString().c_str());
    return s;
  }

  *meta_block = std::move(meta);
  // meta block uses bytewise comparator.
  iter->reset(meta_block->get()->NewIterator(BytewiseComparator()));
  return Status::OK();
}

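// Look up a data block, first in the uncompressed block cache and then in the
// compressed block cache. On a compressed-cache hit, the block is
// uncompressed and, if read_options.fill_cache is set, inserted into the
// uncompressed cache. On success, `block` holds the block and (if cached) its
// cache handle.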
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ImmutableCFOptions& ioptions, const ReadOptions& read_options,
    BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
    const Slice& compression_dict, size_t read_amp_bytes_per_bit,
    bool is_index) {
  Status s;
  Block* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;
  Statistics* statistics = ioptions.statistics;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    block->cache_handle = GetEntryFromCache(
        block_cache, block_cache_key,
        is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS,
        is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, statistics);
    if (block->cache_handle != nullptr) {
      block->value =
          reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->cache_handle == nullptr && block->value == nullptr);

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);
  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<Block*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  assert(compressed_block->compression_type() != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  s = UncompressBlockContents(compressed_block->data(),
                              compressed_block->size(), &contents,
                              format_version, compression_dict, ioptions);

  // Insert uncompressed block into block cache
  if (s.ok()) {
    block->value =
        new Block(std::move(contents), compressed_block->global_seqno(),
                  read_amp_bytes_per_bit,
                  statistics);  // uncompressed block
    assert(block->value->compression_type() == kNoCompression);
    if (block_cache != nullptr && block->value->cachable() &&
        read_options.fill_cache) {
      s = block_cache->Insert(
          block_cache_key, block->value, block->value->usable_size(),
          &DeleteCachedEntry<Block>, &(block->cache_handle));
      if (s.ok()) {
        RecordTick(statistics, BLOCK_CACHE_ADD);
        if (is_index) {
          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
                     block->value->usable_size());
        } else {
          RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
                     block->value->usable_size());
        }
        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
                   block->value->usable_size());
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
        delete block->value;
        block->value = nullptr;
      }
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

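// Insert a freshly read block into the caches: the raw (still compressed)
// block goes into the compressed block cache, and the uncompressed block into
// the regular block cache. Takes ownership of `raw_block`; on success,
// `block` holds the uncompressed block and its cache handle.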
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
    CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
    const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index,
    Cache::Priority priority) {
  assert(raw_block->compression_type() == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  Statistics* statistics = ioptions.statistics;
  if (raw_block->compression_type() != kNoCompression) {
    s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents,
                                format_version, compression_dict, ioptions);
  }
  if (!s.ok()) {
    delete raw_block;
    return s;
  }

  if (raw_block->compression_type() != kNoCompression) {
    block->value = new Block(std::move(contents), raw_block->global_seqno(),
                             read_amp_bytes_per_bit,
                             statistics);  // uncompressed block
  } else {
    block->value = raw_block;
    raw_block = nullptr;
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr && raw_block != nullptr &&
      raw_block->cachable()) {
    s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block,
                                       raw_block->usable_size(),
                                       &DeleteCachedEntry<Block>);
    if (s.ok()) {
      // Prevent the following code from deleting this cached block.
      raw_block = nullptr;
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
    }
  }
  delete raw_block;

  // insert into uncompressed block cache
  assert((block->value->compression_type() == kNoCompression));
  if (block_cache != nullptr && block->value->cachable()) {
    s = block_cache->Insert(
        block_cache_key, block->value, block->value->usable_size(),
        &DeleteCachedEntry<Block>, &(block->cache_handle), priority);
    if (s.ok()) {
      assert(block->cache_handle != nullptr);
      RecordTick(statistics, BLOCK_CACHE_ADD);
      if (is_index) {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
                   block->value->usable_size());
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
                   block->value->usable_size());
      }
      RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
                 block->value->usable_size());
      assert(reinterpret_cast<Block*>(
                 block_cache->Value(block->cache_handle)) == block->value);
    } else {
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      delete block->value;
      block->value = nullptr;
    }
  }

  return s;
}

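// Read the filter block identified by `filter_handle` from the file and wrap
// it in the FilterBlockReader subclass matching the table's filter type
// (partitioned, block-based, or full). Returns nullptr on read failure; the
// caller owns the returned object.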
FilterBlockReader* BlockBasedTable::ReadFilter(
    const BlockHandle& filter_handle, const bool is_a_filter_partition) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;
  if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
                         filter_handle, &block, rep->ioptions,
                         false /* decompress */, Slice() /*compression dict*/,
                         rep->persistent_cache_options)
           .ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}

BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    bool no_io) const {
  const BlockHandle& filter_blk_handle = rep_->filter_handle;
  const bool is_a_filter_partition = true;
  return GetFilter(filter_blk_handle, !is_a_filter_partition, no_io);
}

BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    const BlockHandle& filter_blk_handle, const bool is_a_filter_partition,
    bool no_io) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), nullptr /* cache handle */};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return {nullptr /* filter */, nullptr /* cache handle */};
  }

  if (!is_a_filter_partition && rep_->filter_entry.IsSet()) {
    return rep_->filter_entry;
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Statistics* statistics = rep_->ioptions.statistics;
  auto cache_handle =
      GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS,
                        BLOCK_CACHE_FILTER_HIT, statistics);

  FilterBlockReader* filter = nullptr;
  if (cache_handle != nullptr) {
    filter =
        reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
  } else if (no_io) {
    // Do not invoke any io.
    return CachableEntry<FilterBlockReader>();
  } else {
    filter = ReadFilter(filter_blk_handle, is_a_filter_partition);
    if (filter != nullptr) {
      assert(filter->size() > 0);
      Status s = block_cache->Insert(
          key, filter, filter->size(), &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        RecordTick(statistics, BLOCK_CACHE_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, filter->size());
        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter->size());
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
        delete filter;
        return CachableEntry<FilterBlockReader>();
      }
    }
  }

  return {filter, cache_handle};
}

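// Return an iterator over the index. If `input_iter` is non-null it is
// updated in place; otherwise a new iterator is heap-allocated. Unless the
// index reader was pre-loaded at Open() time, it is fetched from (or created
// and inserted into) the block cache, keyed by dummy_index_reader_offset.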
InternalIterator* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, BlockIter* input_iter,
    CachableEntry<IndexReader>* index_entry) {
  // index reader has already been pre-populated.
  if (rep_->index_reader) {
    return rep_->index_reader->NewIterator(input_iter,
                                           read_options.total_order_seek);
  }
  // we have a pinned index block
  if (rep_->index_entry.IsSet()) {
    return rep_->index_entry.value->NewIterator(input_iter,
                                                read_options.total_order_seek);
  }

  PERF_TIMER_GUARD(read_index_block_nanos);

  const bool no_io = read_options.read_tier == kBlockCacheTier;
  Cache* block_cache = rep_->table_options.block_cache.get();
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key =
      GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                            rep_->dummy_index_reader_offset, cache_key);
  Statistics* statistics = rep_->ioptions.statistics;
  auto cache_handle =
      GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
                        BLOCK_CACHE_INDEX_HIT, statistics);

  if (cache_handle == nullptr && no_io) {
    if (input_iter != nullptr) {
      input_iter->SetStatus(Status::Incomplete("no blocking io"));
      return input_iter;
    } else {
      return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
    }
  }

  IndexReader* index_reader = nullptr;
  if (cache_handle != nullptr) {
    index_reader =
        reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
  } else {
    // Create index reader and put it in the cache.
    Status s;
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2");
    s = CreateIndexReader(&index_reader);
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1");
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3");
    TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4");
    if (s.ok()) {
      assert(index_reader != nullptr);
      s = block_cache->Insert(
          key, index_reader, index_reader->usable_size(),
          &DeleteCachedIndexEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
    }

    if (s.ok()) {
      size_t usable_size = index_reader->usable_size();
      RecordTick(statistics, BLOCK_CACHE_ADD);
      RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
      RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usable_size);
      RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usable_size);
    } else {
      if (index_reader != nullptr) {
        delete index_reader;
      }
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      // make sure if something goes wrong, index_reader shall remain intact.
      if (input_iter != nullptr) {
        input_iter->SetStatus(s);
        return input_iter;
      } else {
        return NewErrorInternalIterator(s);
      }
    }
  }

  assert(cache_handle);
  auto* iter =
      index_reader->NewIterator(input_iter, read_options.total_order_seek);

  // the caller would like to take ownership of the index block
  // don't call RegisterCleanup() in this case, the caller will take care of it
  if (index_entry != nullptr) {
    *index_entry = {index_reader, cache_handle};
  } else {
    iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
  }

  return iter;
}

InternalIterator* BlockBasedTable::NewDataBlockIterator(
    Rep* rep, const ReadOptions& ro, const Slice& index_value,
    BlockIter* input_iter, bool is_index) {
  BlockHandle handle;
  Slice input = index_value;
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.
  Status s = handle.DecodeFrom(&input);
  return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, s);
}

// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, a new iterator is created and returned.
// If input_iter is not null, it is updated and returned.
InternalIterator* BlockBasedTable::NewDataBlockIterator(
    Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
    BlockIter* input_iter, bool is_index, Status s) {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep->table_options.block_cache.get();
  CachableEntry<Block> block;
  Slice compression_dict;
  if (s.ok()) {
    if (rep->compression_dict_block) {
      compression_dict = rep->compression_dict_block->data;
    }
    s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block,
                                  is_index);
  }

  // Didn't get any data from block caches.
  if (s.ok() && block.value == nullptr) {
    if (no_io) {
      // Could not read from block_cache and can't do IO
      if (input_iter != nullptr) {
        input_iter->SetStatus(Status::Incomplete("no blocking io"));
        return input_iter;
      } else {
        return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
      }
    }
    std::unique_ptr<Block> block_value;
    s = ReadBlockFromFile(
        rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions,
        true /* do_uncompress */, compression_dict,
        rep->persistent_cache_options, rep->global_seqno,
        rep->table_options.read_amp_bytes_per_bit);
    if (s.ok()) {
      block.value = block_value.release();
    }
  }

  InternalIterator* iter;
  if (s.ok()) {
    assert(block.value != nullptr);
    iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
                                    rep->ioptions.statistics);
    if (block.cache_handle != nullptr) {
      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
                            block.cache_handle);
    } else {
      iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
    }
  } else {
    assert(block.value == nullptr);
    if (input_iter != nullptr) {
      input_iter->SetStatus(s);
      iter = input_iter;
    } else {
      iter = NewErrorInternalIterator(s);
    }
  }
  return iter;
}

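// Try to fetch the block at `handle` from the block caches. On a miss, and
// only if IO is allowed and ro.fill_cache is set, read the block from the
// file and insert it into the cache(s). On success, `block_entry` holds the
// block; its value stays nullptr when the block is not cached and no IO was
// performed.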
Status BlockBasedTable::MaybeLoadDataBlockToCache(
    Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
    Slice compression_dict, CachableEntry<Block>* block_entry, bool is_index) {
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep->table_options.block_cache.get();
  Cache* block_cache_compressed =
      rep->table_options.block_cache_compressed.get();

  // If either block cache is enabled, we'll try to read from it.
  Status s;
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    Statistics* statistics = rep->ioptions.statistics;
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
    char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
    Slice key, /* key to the block cache */
        ckey /* key to the compressed block cache */;

    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size,
                        handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(rep->compressed_cache_key_prefix,
                         rep->compressed_cache_key_prefix_size, handle,
                         compressed_cache_key);
    }

    s = GetDataBlockFromCache(
        key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
        block_entry, rep->table_options.format_version, compression_dict,
        rep->table_options.read_amp_bytes_per_bit, is_index);

    if (block_entry->value == nullptr && !no_io && ro.fill_cache) {
      std::unique_ptr<Block> raw_block;
      {
        StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        s = ReadBlockFromFile(
            rep->file.get(), rep->footer, ro, handle, &raw_block, rep->ioptions,
            block_cache_compressed == nullptr, compression_dict,
            rep->persistent_cache_options, rep->global_seqno,
            rep->table_options.read_amp_bytes_per_bit);
      }

      if (s.ok()) {
        s = PutDataBlockToCache(
            key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
            block_entry, raw_block.release(), rep->table_options.format_version,
            compression_dict, rep->table_options.read_amp_bytes_per_bit,
            is_index,
            is_index && rep->table_options
                            .cache_index_and_filter_blocks_with_high_priority
                ? Cache::Priority::HIGH
                : Cache::Priority::LOW);
      }
    }
  }
  return s;
}

BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState(
    BlockBasedTable* table, const ReadOptions& read_options, bool skip_filters,
    bool is_index, Cleanable* block_cache_cleaner)
    : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr),
      table_(table),
      read_options_(read_options),
      skip_filters_(skip_filters),
      is_index_(is_index),
      block_cache_cleaner_(block_cache_cleaner) {}

InternalIterator*
BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
    const Slice& index_value) {
  // Return a block iterator on the index partition
  BlockHandle handle;
  Slice input = index_value;
  Status s = handle.DecodeFrom(&input);
  auto iter = NewDataBlockIterator(table_->rep_, read_options_, handle, nullptr,
                                   is_index_, s);
  if (block_cache_cleaner_) {
    uint64_t offset = handle.offset();
    {
      ReadLock rl(&cleaner_mu);
      if (cleaner_set.find(offset) != cleaner_set.end()) {
        // already have a reference to the block cache objects
        return iter;
      }
    }
    WriteLock wl(&cleaner_mu);
    cleaner_set.insert(offset);
    // Keep the data in cache until the cleaner cleans it up
    iter->DelegateCleanupsTo(block_cache_cleaner_);
  }
  return iter;
}

bool BlockBasedTable::BlockEntryIteratorState::PrefixMayMatch(
    const Slice& internal_key) {
  if (read_options_.total_order_seek || skip_filters_) {
    return true;
  }
  return table_->PrefixMayMatch(internal_key);
}

// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
  if (!rep_->filter_policy) {
    return true;
  }

  assert(rep_->ioptions.prefix_extractor != nullptr);
  auto user_key = ExtractUserKey(internal_key);
  if (!rep_->ioptions.prefix_extractor->InDomain(user_key) ||
      rep_->table_properties->prefix_extractor_name.compare(
          rep_->ioptions.prefix_extractor->Name()) != 0) {
    return true;
  }
  auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key);

  bool may_match = true;
  Status s;

  // First, try to check with the full filter
  const bool no_io = true;
  auto filter_entry = GetFilter(no_io);
  FilterBlockReader* filter = filter_entry.value;
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
      const Slice* const const_ikey_ptr = &internal_key;
      may_match =
          filter->PrefixMayMatch(prefix, kNotValid, no_io, const_ikey_ptr);
    } else {
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

      // Then, try to find it within each block
      unique_ptr<InternalIterator> iiter(NewIndexIterator(no_io_read_options));
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
      } else if (ExtractUserKey(iiter->key())
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block. However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix. Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix. Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key. Thus, the corresponding data block
        // is the only one that could potentially contain the prefix.
        Slice handle_value = iiter->value();
        BlockHandle handle;
        s = handle.DecodeFrom(&handle_value);
        assert(s.ok());
        may_match = filter->PrefixMayMatch(prefix, handle.offset());
      }
    }
  }

  Statistics* statistics = rep_->ioptions.statistics;
  RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
  if (!may_match) {
    RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
  }

  // if rep_->filter_entry is not set, we should call Release(); otherwise
  // don't call, in this case we have a local copy in rep_->filter_entry,
  // it's pinned to the cache and will be released in the destructor
  if (!rep_->filter_entry.IsSet()) {
    filter_entry.Release(rep_->table_options.block_cache.get());
  }

  return may_match;
}

InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
                                               Arena* arena,
                                               bool skip_filters) {
  return NewTwoLevelIterator(
      new BlockEntryIteratorState(this, read_options, skip_filters),
      NewIndexIterator(read_options), arena);
}

1535InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
1536 const ReadOptions& read_options) {
1537 if (rep_->range_del_handle.IsNull()) {
1538 // The block didn't exist, nullptr indicates no range tombstones.
1539 return nullptr;
1540 }
1541 if (rep_->range_del_entry.cache_handle != nullptr) {
1542 // We have a handle to an uncompressed block cache entry that's held for
1543 // this table's lifetime. Increment its refcount before returning an
1544 // iterator based on it since the returned iterator may outlive this table
1545 // reader.
1546 assert(rep_->range_del_entry.value != nullptr);
1547 Cache* block_cache = rep_->table_options.block_cache.get();
1548 assert(block_cache != nullptr);
1549 if (block_cache->Ref(rep_->range_del_entry.cache_handle)) {
1550 auto iter = rep_->range_del_entry.value->NewIterator(
1551 &rep_->internal_comparator, nullptr /* iter */,
1552 true /* total_order_seek */, rep_->ioptions.statistics);
1553 iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
1554 rep_->range_del_entry.cache_handle);
1555 return iter;
1556 }
1557 }
1558 std::string str;
1559 rep_->range_del_handle.EncodeTo(&str);
  // The meta-block exists but isn't in the uncompressed block cache (maybe
  // because the cache is disabled), so go through the full lookup process.
1562 return NewDataBlockIterator(rep_, read_options, Slice(str));
1563}
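
// Consumption sketch for the tombstone iterator above (illustrative only):
// nullptr means "no range tombstones", and the caller owns any non-null
// iterator, which may legitimately outlive this table reader thanks to the
// extra cache reference taken above.
#if 0
void ExampleScanRangeTombstones(rocksdb::BlockBasedTable* table) {
  std::unique_ptr<rocksdb::InternalIterator> iter(
      table->NewRangeTombstoneIterator(rocksdb::ReadOptions()));
  if (iter == nullptr) {
    return;  // the table has no range deletion meta-block
  }
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // Each entry encodes one range tombstone (start key -> end key).
  }
}
#endif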
1564
1565bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options,
1566 FilterBlockReader* filter,
1567 const Slice& internal_key,
1568 const bool no_io) const {
1569 if (filter == nullptr || filter->IsBlockBased()) {
1570 return true;
1571 }
1572 Slice user_key = ExtractUserKey(internal_key);
1573 const Slice* const const_ikey_ptr = &internal_key;
1574 if (filter->whole_key_filtering()) {
1575 return filter->KeyMayMatch(user_key, kNotValid, no_io, const_ikey_ptr);
1576 }
1577 if (!read_options.total_order_seek && rep_->ioptions.prefix_extractor &&
1578 rep_->table_properties->prefix_extractor_name.compare(
1579 rep_->ioptions.prefix_extractor->Name()) == 0 &&
1580 rep_->ioptions.prefix_extractor->InDomain(user_key) &&
1581 !filter->PrefixMayMatch(
1582 rep_->ioptions.prefix_extractor->Transform(user_key), kNotValid,
1583 false, const_ikey_ptr)) {
1584 return false;
1585 }
1586 return true;
1587}
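
// The prefix branch above only fires when the extractor name stored in the
// table's properties matches the currently configured one, so a table written
// with a different prefix_extractor cannot reuse its prefix filter. A
// write-side configuration sketch (illustrative only; the prefix length 8 and
// 10 bits per key are arbitrary assumptions):
#if 0
rocksdb::Options ExamplePrefixFilteredOptions() {
  rocksdb::Options options;
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10, false /* full (not block) filter */));
  table_options.whole_key_filtering = true;  // also filter on whole keys
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
#endif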
1588
1589Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
1590 GetContext* get_context, bool skip_filters) {
1591 Status s;
1592 const bool no_io = read_options.read_tier == kBlockCacheTier;
1593 CachableEntry<FilterBlockReader> filter_entry;
1594 if (!skip_filters) {
1595 filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
1596 }
1597 FilterBlockReader* filter = filter_entry.value;
1598
  // First check the full filter.
  // If the full filter is not useful, then go into each block.
1601 if (!FullFilterKeyMayMatch(read_options, filter, key, no_io)) {
1602 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
1603 } else {
1604 BlockIter iiter_on_stack;
1605 auto iiter = NewIndexIterator(read_options, &iiter_on_stack);
1606 std::unique_ptr<InternalIterator> iiter_unique_ptr;
1607 if (iiter != &iiter_on_stack) {
1608 iiter_unique_ptr.reset(iiter);
1609 }
1610
1611 bool done = false;
1612 for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
1613 Slice handle_value = iiter->value();
1614
1615 BlockHandle handle;
1616 bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() &&
1618 handle.DecodeFrom(&handle_value).ok() &&
1619 !filter->KeyMayMatch(ExtractUserKey(key), handle.offset(), no_io);
1620
1621 if (not_exist_in_filter) {
1622 // Not found
1623 // TODO: think about interaction with Merge. If a user key cannot
1624 // cross one data block, we should be fine.
1625 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
1626 break;
1627 } else {
1628 BlockIter biter;
1629 NewDataBlockIterator(rep_, read_options, iiter->value(), &biter);
1630
1631 if (read_options.read_tier == kBlockCacheTier &&
1632 biter.status().IsIncomplete()) {
          // Couldn't get the block from block_cache: mark the key as possibly
          // existing, since when "no_io" is set we are only looking for
          // whether we can guarantee the key is not there.
1636 get_context->MarkKeyMayExist();
1637 break;
1638 }
1639 if (!biter.status().ok()) {
1640 s = biter.status();
1641 break;
1642 }
1643
        // Call get_context->SaveValue() on each entry until it returns false.
1645 for (biter.Seek(key); biter.Valid(); biter.Next()) {
1646 ParsedInternalKey parsed_key;
1647 if (!ParseInternalKey(biter.key(), &parsed_key)) {
1648 s = Status::Corruption(Slice());
1649 }
1650
1651 if (!get_context->SaveValue(parsed_key, biter.value(), &biter)) {
1652 done = true;
1653 break;
1654 }
1655 }
1656 s = biter.status();
1657 }
1658 if (done) {
1659 // Avoid the extra Next which is expensive in two-level indexes
1660 break;
1661 }
1662 }
1663 if (s.ok()) {
1664 s = iiter->status();
1665 }
1666 }
1667
  // If rep_->filter_entry is not set, we should call Release(); otherwise
  // don't call: in that case rep_->filter_entry holds a copy of the entry,
  // which is pinned in the cache and will be released in the destructor.
1671 if (!rep_->filter_entry.IsSet()) {
1672 filter_entry.Release(rep_->table_options.block_cache.get());
1673 }
1674 return s;
1675}
1676
1677Status BlockBasedTable::Prefetch(const Slice* const begin,
1678 const Slice* const end) {
1679 auto& comparator = rep_->internal_comparator;
1680 // pre-condition
1681 if (begin && end && comparator.Compare(*begin, *end) > 0) {
1682 return Status::InvalidArgument(*begin, *end);
1683 }
1684
1685 BlockIter iiter_on_stack;
1686 auto iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack);
1687 std::unique_ptr<InternalIterator> iiter_unique_ptr;
1688 if (iiter != &iiter_on_stack) {
1689 iiter_unique_ptr = std::unique_ptr<InternalIterator>(iiter);
1690 }
1691
1692 if (!iiter->status().ok()) {
1693 // error opening index iterator
1694 return iiter->status();
1695 }
1696
  // Indicates whether we are on the last page that needs to be pre-fetched.
1698 bool prefetching_boundary_page = false;
1699
1700 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
1701 iiter->Next()) {
1702 Slice block_handle = iiter->value();
1703
1704 if (end && comparator.Compare(iiter->key(), *end) >= 0) {
1705 if (prefetching_boundary_page) {
1706 break;
1707 }
1708
1709 // The index entry represents the last key in the data block.
1710 // We should load this page into memory as well, but no more
1711 prefetching_boundary_page = true;
1712 }
1713
1714 // Load the block specified by the block_handle into the block cache
1715 BlockIter biter;
1716 NewDataBlockIterator(rep_, ReadOptions(), block_handle, &biter);
1717
1718 if (!biter.status().ok()) {
1719 // there was an unexpected error while pre-fetching
1720 return biter.status();
1721 }
1722 }
1723
1724 return Status::OK();
1725}
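
// Usage sketch for Prefetch() (illustrative only): begin and end are
// *internal* keys, so a caller holding user keys first wraps them with a
// sequence number and value type suitable for seeking.
#if 0
rocksdb::Status ExampleWarmRange(rocksdb::BlockBasedTable* table,
                                 const rocksdb::Slice& user_begin,
                                 const rocksdb::Slice& user_end) {
  rocksdb::InternalKey ibegin(user_begin, rocksdb::kMaxSequenceNumber,
                              rocksdb::kValueTypeForSeek);
  rocksdb::InternalKey iend(user_end, rocksdb::kMaxSequenceNumber,
                            rocksdb::kValueTypeForSeek);
  rocksdb::Slice begin = ibegin.Encode();
  rocksdb::Slice end = iend.Encode();
  return table->Prefetch(&begin, &end);  // loads covered blocks into cache
}
#endif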
1726
1727bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
1728 const Slice& key) {
1729 std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options));
1730 iiter->Seek(key);
1731 assert(iiter->Valid());
1732 CachableEntry<Block> block;
1733
1734 BlockHandle handle;
1735 Slice input = iiter->value();
1736 Status s = handle.DecodeFrom(&input);
1737 assert(s.ok());
1738 Cache* block_cache = rep_->table_options.block_cache.get();
1739 assert(block_cache != nullptr);
1740
1741 char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
1742 Slice cache_key =
1743 GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
1744 handle, cache_key_storage);
1745 Slice ckey;
1746
1747 s = GetDataBlockFromCache(
1748 cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block,
1749 rep_->table_options.format_version,
1750 rep_->compression_dict_block ? rep_->compression_dict_block->data
1751 : Slice(),
1752 0 /* read_amp_bytes_per_bit */);
1753 assert(s.ok());
1754 bool in_cache = block.value != nullptr;
1755 if (in_cache) {
1756 ReleaseCachedEntry(block_cache, block.cache_handle);
1757 }
1758 return in_cache;
1759}
1760
1761// REQUIRES: The following fields of rep_ should have already been populated:
1762// 1. file
// 2. index_handle
1764// 3. options
1765// 4. internal_comparator
1766// 5. index_type
1767Status BlockBasedTable::CreateIndexReader(
1768 IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter,
1769 int level) {
  // Some old versions of block-based tables don't have the index type present
  // in their table properties. If that's the case, we can safely fall back to
  // kBinarySearch.
1772 auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
1773 if (rep_->table_properties) {
1774 auto& props = rep_->table_properties->user_collected_properties;
1775 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
1776 if (pos != props.end()) {
1777 index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
1778 DecodeFixed32(pos->second.c_str()));
1779 }
1780 }
1781
1782 auto file = rep_->file.get();
1783 auto comparator = &rep_->internal_comparator;
1784 const Footer& footer = rep_->footer;
1785 if (index_type_on_file == BlockBasedTableOptions::kHashSearch &&
1786 rep_->ioptions.prefix_extractor == nullptr) {
1787 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1788 "BlockBasedTableOptions::kHashSearch requires "
1789 "options.prefix_extractor to be set."
1790 " Fall back to binary search index.");
1791 index_type_on_file = BlockBasedTableOptions::kBinarySearch;
1792 }
1793
1794 switch (index_type_on_file) {
1795 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
1796 return PartitionIndexReader::Create(
1797 this, file, footer, footer.index_handle(), rep_->ioptions, comparator,
1798 index_reader, rep_->persistent_cache_options, level);
1799 }
1800 case BlockBasedTableOptions::kBinarySearch: {
1801 return BinarySearchIndexReader::Create(
1802 file, footer, footer.index_handle(), rep_->ioptions, comparator,
1803 index_reader, rep_->persistent_cache_options);
1804 }
1805 case BlockBasedTableOptions::kHashSearch: {
1806 std::unique_ptr<Block> meta_guard;
1807 std::unique_ptr<InternalIterator> meta_iter_guard;
1808 auto meta_index_iter = preloaded_meta_index_iter;
1809 if (meta_index_iter == nullptr) {
1810 auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard);
1811 if (!s.ok()) {
        // We simply fall back to binary search in case there is any
        // problem with loading the prefix hash index.
1814 ROCKS_LOG_WARN(rep_->ioptions.info_log,
1815 "Unable to read the metaindex block."
1816 " Fall back to binary search index.");
1817 return BinarySearchIndexReader::Create(
1818 file, footer, footer.index_handle(), rep_->ioptions, comparator,
1819 index_reader, rep_->persistent_cache_options);
1820 }
1821 meta_index_iter = meta_iter_guard.get();
1822 }
1823
1824 return HashIndexReader::Create(
1825 rep_->internal_prefix_transform.get(), footer, file, rep_->ioptions,
1826 comparator, footer.index_handle(), meta_index_iter, index_reader,
1827 rep_->hash_index_allow_collision, rep_->persistent_cache_options);
1828 }
1829 default: {
1830 std::string error_message =
1831 "Unrecognized index type: " + ToString(rep_->index_type);
1832 return Status::InvalidArgument(error_message.c_str());
1833 }
1834 }
1835}
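
// The index type consumed above is chosen at write time through
// BlockBasedTableOptions; kHashSearch silently degrades to binary search when
// no prefix_extractor is configured, as the warning above notes. A
// configuration sketch (illustrative only):
#if 0
rocksdb::BlockBasedTableOptions ExampleIndexOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Alternatives: kBinarySearch (the default) or kHashSearch (which needs a
  // prefix_extractor on the column family options).
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;  // partitioned
  return table_options;
}
#endif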
1836
1837uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
1838 unique_ptr<InternalIterator> index_iter(NewIndexIterator(ReadOptions()));
1839
1840 index_iter->Seek(key);
1841 uint64_t result;
1842 if (index_iter->Valid()) {
1843 BlockHandle handle;
1844 Slice input = index_iter->value();
1845 Status s = handle.DecodeFrom(&input);
1846 if (s.ok()) {
1847 result = handle.offset();
1848 } else {
1849 // Strange: we can't decode the block handle in the index block.
1850 // We'll just return the offset of the metaindex block, which is
1851 // close to the whole file size for this case.
1852 result = rep_->footer.metaindex_handle().offset();
1853 }
1854 } else {
    // key is past the last key in the file. Approximate the offset by the
    // data size recorded in table_properties; if that is unavailable, fall
    // back to the offset of the metaindex block (which is right near the
    // end of the file).
1858 result = 0;
1859 if (rep_->table_properties) {
1860 result = rep_->table_properties->data_size;
1861 }
    // table_properties is not present in the table, or it reported a zero
    // data size.
1863 if (result == 0) {
1864 result = rep_->footer.metaindex_handle().offset();
1865 }
1866 }
1867 return result;
1868}
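
// ApproximateOffsetOf() is monotonic in the key, so the on-disk size of a key
// range within this file can be estimated by subtraction; this is the same
// idea DB::GetApproximateSizes() applies across all files. A sketch
// (illustrative only; both arguments are internal keys):
#if 0
uint64_t ExampleApproximateRangeBytes(rocksdb::BlockBasedTable* table,
                                      const rocksdb::Slice& ibegin,
                                      const rocksdb::Slice& iend) {
  uint64_t begin_off = table->ApproximateOffsetOf(ibegin);
  uint64_t end_off = table->ApproximateOffsetOf(iend);
  return end_off >= begin_off ? end_off - begin_off : 0;
}
#endif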
1869
1870bool BlockBasedTable::TEST_filter_block_preloaded() const {
1871 return rep_->filter != nullptr;
1872}
1873
1874bool BlockBasedTable::TEST_index_reader_preloaded() const {
1875 return rep_->index_reader != nullptr;
1876}
1877
1878Status BlockBasedTable::GetKVPairsFromDataBlocks(
1879 std::vector<KVPairBlock>* kv_pair_blocks) {
1880 std::unique_ptr<InternalIterator> blockhandles_iter(
1881 NewIndexIterator(ReadOptions()));
1882
1883 Status s = blockhandles_iter->status();
1884 if (!s.ok()) {
1885 // Cannot read Index Block
1886 return s;
1887 }
1888
1889 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
1890 blockhandles_iter->Next()) {
1891 s = blockhandles_iter->status();
1892
1893 if (!s.ok()) {
1894 break;
1895 }
1896
1897 std::unique_ptr<InternalIterator> datablock_iter;
1898 datablock_iter.reset(
1899 NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value()));
1900 s = datablock_iter->status();
1901
1902 if (!s.ok()) {
1903 // Error reading the block - Skipped
1904 continue;
1905 }
1906
1907 KVPairBlock kv_pair_block;
1908 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
1909 datablock_iter->Next()) {
1910 s = datablock_iter->status();
1911 if (!s.ok()) {
1912 // Error reading the block - Skipped
1913 break;
1914 }
1915 const Slice& key = datablock_iter->key();
1916 const Slice& value = datablock_iter->value();
1917 std::string key_copy = std::string(key.data(), key.size());
1918 std::string value_copy = std::string(value.data(), value.size());
1919
1920 kv_pair_block.push_back(
1921 std::make_pair(std::move(key_copy), std::move(value_copy)));
1922 }
1923 kv_pair_blocks->push_back(std::move(kv_pair_block));
1924 }
1925 return Status::OK();
1926}
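
// Consumption sketch for GetKVPairsFromDataBlocks() (illustrative only): each
// KVPairBlock holds the decoded key/value string pairs of one data block, in
// file order.
#if 0
void ExampleCollectKVs(rocksdb::BlockBasedTable* table) {
  std::vector<rocksdb::KVPairBlock> kv_pair_blocks;
  if (table->GetKVPairsFromDataBlocks(&kv_pair_blocks).ok()) {
    for (const rocksdb::KVPairBlock& block : kv_pair_blocks) {
      for (const auto& kv : block) {
        // kv.first is an internal key; kv.second is the value.
      }
    }
  }
}
#endif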
1927
1928Status BlockBasedTable::DumpTable(WritableFile* out_file) {
1929 // Output Footer
1930 out_file->Append(
1931 "Footer Details:\n"
1932 "--------------------------------------\n"
1933 " ");
1934 out_file->Append(rep_->footer.ToString().c_str());
1935 out_file->Append("\n");
1936
1937 // Output MetaIndex
1938 out_file->Append(
1939 "Metaindex Details:\n"
1940 "--------------------------------------\n");
1941 std::unique_ptr<Block> meta;
1942 std::unique_ptr<InternalIterator> meta_iter;
1943 Status s = ReadMetaBlock(rep_, &meta, &meta_iter);
1944 if (s.ok()) {
1945 for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
1946 s = meta_iter->status();
1947 if (!s.ok()) {
1948 return s;
1949 }
1950 if (meta_iter->key() == rocksdb::kPropertiesBlock) {
1951 out_file->Append(" Properties block handle: ");
1952 out_file->Append(meta_iter->value().ToString(true).c_str());
1953 out_file->Append("\n");
1954 } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
1955 out_file->Append(" Compression dictionary block handle: ");
1956 out_file->Append(meta_iter->value().ToString(true).c_str());
1957 out_file->Append("\n");
1958 } else if (strstr(meta_iter->key().ToString().c_str(),
1959 "filter.rocksdb.") != nullptr) {
1960 out_file->Append(" Filter block handle: ");
1961 out_file->Append(meta_iter->value().ToString(true).c_str());
1962 out_file->Append("\n");
1963 } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
1964 out_file->Append(" Range deletion block handle: ");
1965 out_file->Append(meta_iter->value().ToString(true).c_str());
1966 out_file->Append("\n");
1967 }
1968 }
1969 out_file->Append("\n");
1970 } else {
1971 return s;
1972 }
1973
1974 // Output TableProperties
1975 const rocksdb::TableProperties* table_properties;
1976 table_properties = rep_->table_properties.get();
1977
1978 if (table_properties != nullptr) {
1979 out_file->Append(
1980 "Table Properties:\n"
1981 "--------------------------------------\n"
1982 " ");
1983 out_file->Append(table_properties->ToString("\n ", ": ").c_str());
1984 out_file->Append("\n");
1985 }
1986
  // Output Filter blocks
  if (!rep_->filter && table_properties != nullptr &&
      !table_properties->filter_policy_name.empty()) {
    // Only BloomFilter is supported as of now.
1990 rocksdb::BlockBasedTableOptions table_options;
1991 table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
1992 if (table_properties->filter_policy_name.compare(
1993 table_options.filter_policy->Name()) == 0) {
1994 std::string filter_block_key = kFilterBlockPrefix;
1995 filter_block_key.append(table_properties->filter_policy_name);
1996 BlockHandle handle;
1997 if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
1998 BlockContents block;
1999 if (ReadBlockContents(
2000 rep_->file.get(), rep_->footer, ReadOptions(), handle, &block,
2001 rep_->ioptions, false /*decompress*/,
2002 Slice() /*compression dict*/, rep_->persistent_cache_options)
2003 .ok()) {
2004 rep_->filter.reset(new BlockBasedFilterBlockReader(
2005 rep_->ioptions.prefix_extractor, table_options,
2006 table_options.whole_key_filtering, std::move(block),
2007 rep_->ioptions.statistics));
2008 }
2009 }
2010 }
2011 }
2012 if (rep_->filter) {
2013 out_file->Append(
2014 "Filter Details:\n"
2015 "--------------------------------------\n"
2016 " ");
2017 out_file->Append(rep_->filter->ToString().c_str());
2018 out_file->Append("\n");
2019 }
2020
2021 // Output Index block
2022 s = DumpIndexBlock(out_file);
2023 if (!s.ok()) {
2024 return s;
2025 }
2026
2027 // Output compression dictionary
2028 if (rep_->compression_dict_block != nullptr) {
2029 auto compression_dict = rep_->compression_dict_block->data;
2030 out_file->Append(
2031 "Compression Dictionary:\n"
2032 "--------------------------------------\n");
2033 out_file->Append(" size (bytes): ");
2034 out_file->Append(rocksdb::ToString(compression_dict.size()));
2035 out_file->Append("\n\n");
2036 out_file->Append(" HEX ");
2037 out_file->Append(compression_dict.ToString(true).c_str());
2038 out_file->Append("\n\n");
2039 }
2040
2041 // Output range deletions block
2042 auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
2043 if (range_del_iter != nullptr) {
2044 range_del_iter->SeekToFirst();
2045 if (range_del_iter->Valid()) {
2046 out_file->Append(
2047 "Range deletions:\n"
2048 "--------------------------------------\n"
2049 " ");
2050 for (; range_del_iter->Valid(); range_del_iter->Next()) {
2051 DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
2052 }
2053 out_file->Append("\n");
2054 }
2055 delete range_del_iter;
2056 }
2057 // Output Data blocks
2058 s = DumpDataBlocks(out_file);
2059
2060 return s;
2061}
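
// Sketch of driving DumpTable() (illustrative only; the sst_dump tool wraps
// the same entry point, and the output path here is a hypothetical example):
#if 0
rocksdb::Status ExampleDumpTable(rocksdb::BlockBasedTable* table) {
  std::unique_ptr<rocksdb::WritableFile> out;
  rocksdb::Status s = rocksdb::Env::Default()->NewWritableFile(
      "/tmp/sst_dump.txt", &out, rocksdb::EnvOptions());
  if (!s.ok()) {
    return s;
  }
  return table->DumpTable(out.get());
}
#endif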
2062
2063void BlockBasedTable::Close() {
2064 rep_->filter_entry.Release(rep_->table_options.block_cache.get());
2065 rep_->index_entry.Release(rep_->table_options.block_cache.get());
2066 rep_->range_del_entry.Release(rep_->table_options.block_cache.get());
  // Clean up index and filter blocks to avoid accessing dangling pointers.
2068 if (!rep_->table_options.no_block_cache) {
2069 char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2070 // Get the filter block key
2071 auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2072 rep_->footer.metaindex_handle(), cache_key);
2073 rep_->table_options.block_cache.get()->Erase(key);
2074 // Get the index block key
2075 key = GetCacheKeyFromOffset(rep_->cache_key_prefix,
2076 rep_->cache_key_prefix_size,
2077 rep_->dummy_index_reader_offset, cache_key);
2078 rep_->table_options.block_cache.get()->Erase(key);
2079 }
2080}
2081
2082Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
2083 out_file->Append(
2084 "Index Details:\n"
2085 "--------------------------------------\n");
2086
2087 std::unique_ptr<InternalIterator> blockhandles_iter(
2088 NewIndexIterator(ReadOptions()));
2089 Status s = blockhandles_iter->status();
2090 if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
2092 return s;
2093 }
2094
2095 out_file->Append(" Block key hex dump: Data block handle\n");
2096 out_file->Append(" Block key ascii\n\n");
2097 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
2098 blockhandles_iter->Next()) {
2099 s = blockhandles_iter->status();
2100 if (!s.ok()) {
2101 break;
2102 }
2103 Slice key = blockhandles_iter->key();
2104 InternalKey ikey;
2105 ikey.DecodeFrom(key);
2106
2107 out_file->Append(" HEX ");
2108 out_file->Append(ikey.user_key().ToString(true).c_str());
2109 out_file->Append(": ");
2110 out_file->Append(blockhandles_iter->value().ToString(true).c_str());
2111 out_file->Append("\n");
2112
2113 std::string str_key = ikey.user_key().ToString();
2114 std::string res_key("");
2115 char cspace = ' ';
2116 for (size_t i = 0; i < str_key.size(); i++) {
2117 res_key.append(&str_key[i], 1);
2118 res_key.append(1, cspace);
2119 }
2120 out_file->Append(" ASCII ");
2121 out_file->Append(res_key.c_str());
2122 out_file->Append("\n ------\n");
2123 }
2124 out_file->Append("\n");
2125 return Status::OK();
2126}
2127
2128Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
2129 std::unique_ptr<InternalIterator> blockhandles_iter(
2130 NewIndexIterator(ReadOptions()));
2131 Status s = blockhandles_iter->status();
2132 if (!s.ok()) {
    out_file->Append("Cannot read Index Block\n\n");
2134 return s;
2135 }
2136
2137 uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
2138 uint64_t datablock_size_max = 0;
2139 uint64_t datablock_size_sum = 0;
2140
2141 size_t block_id = 1;
2142 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
2143 block_id++, blockhandles_iter->Next()) {
2144 s = blockhandles_iter->status();
2145 if (!s.ok()) {
2146 break;
2147 }
2148
2149 Slice bh_val = blockhandles_iter->value();
2150 BlockHandle bh;
2151 bh.DecodeFrom(&bh_val);
2152 uint64_t datablock_size = bh.size();
2153 datablock_size_min = std::min(datablock_size_min, datablock_size);
2154 datablock_size_max = std::max(datablock_size_max, datablock_size);
2155 datablock_size_sum += datablock_size;
2156
2157 out_file->Append("Data Block # ");
2158 out_file->Append(rocksdb::ToString(block_id));
2159 out_file->Append(" @ ");
2160 out_file->Append(blockhandles_iter->value().ToString(true).c_str());
2161 out_file->Append("\n");
2162 out_file->Append("--------------------------------------\n");
2163
2164 std::unique_ptr<InternalIterator> datablock_iter;
2165 datablock_iter.reset(
2166 NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value()));
2167 s = datablock_iter->status();
2168
2169 if (!s.ok()) {
2170 out_file->Append("Error reading the block - Skipped \n\n");
2171 continue;
2172 }
2173
2174 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
2175 datablock_iter->Next()) {
2176 s = datablock_iter->status();
2177 if (!s.ok()) {
2178 out_file->Append("Error reading the block - Skipped \n");
2179 break;
2180 }
2181 DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
2182 }
2183 out_file->Append("\n");
2184 }
2185
2186 uint64_t num_datablocks = block_id - 1;
2187 if (num_datablocks) {
2188 double datablock_size_avg =
2189 static_cast<double>(datablock_size_sum) / num_datablocks;
2190 out_file->Append("Data Block Summary:\n");
2191 out_file->Append("--------------------------------------");
2192 out_file->Append("\n # data blocks: ");
2193 out_file->Append(rocksdb::ToString(num_datablocks));
2194 out_file->Append("\n min data block size: ");
2195 out_file->Append(rocksdb::ToString(datablock_size_min));
2196 out_file->Append("\n max data block size: ");
2197 out_file->Append(rocksdb::ToString(datablock_size_max));
2198 out_file->Append("\n avg data block size: ");
2199 out_file->Append(rocksdb::ToString(datablock_size_avg));
2200 out_file->Append("\n");
2201 }
2202
2203 return Status::OK();
2204}
2205
2206void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
2207 WritableFile* out_file) {
2208 InternalKey ikey;
2209 ikey.DecodeFrom(key);
2210
2211 out_file->Append(" HEX ");
2212 out_file->Append(ikey.user_key().ToString(true).c_str());
2213 out_file->Append(": ");
2214 out_file->Append(value.ToString(true).c_str());
2215 out_file->Append("\n");
2216
2217 std::string str_key = ikey.user_key().ToString();
2218 std::string str_value = value.ToString();
2219 std::string res_key(""), res_value("");
2220 char cspace = ' ';
2221 for (size_t i = 0; i < str_key.size(); i++) {
2222 res_key.append(&str_key[i], 1);
2223 res_key.append(1, cspace);
2224 }
2225 for (size_t i = 0; i < str_value.size(); i++) {
2226 res_value.append(&str_value[i], 1);
2227 res_value.append(1, cspace);
2228 }
2229
2230 out_file->Append(" ASCII ");
2231 out_file->Append(res_key.c_str());
2232 out_file->Append(": ");
2233 out_file->Append(res_value.c_str());
2234 out_file->Append("\n ------\n");
2235}
2236
2237namespace {
2238
2239void DeleteCachedFilterEntry(const Slice& key, void* value) {
2240 FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
2241 if (filter->statistics() != nullptr) {
2242 RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
2243 filter->size());
2244 }
2245 delete filter;
2246}
2247
2248void DeleteCachedIndexEntry(const Slice& key, void* value) {
2249 IndexReader* index_reader = reinterpret_cast<IndexReader*>(value);
2250 if (index_reader->statistics() != nullptr) {
2251 RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT,
2252 index_reader->usable_size());
2253 }
2254 delete index_reader;
2255}
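
// These deleters are registered with Cache::Insert() so that eviction
// statistics are recorded when the block cache drops an entry. A registration
// sketch (illustrative only; `cache_key` and `index_reader` are assumed to be
// prepared by the caller):
#if 0
rocksdb::Status ExampleInsertIndexEntry(rocksdb::Cache* cache,
                                        const rocksdb::Slice& cache_key,
                                        IndexReader* index_reader) {
  rocksdb::Cache::Handle* handle = nullptr;
  return cache->Insert(cache_key, index_reader, index_reader->usable_size(),
                       &DeleteCachedIndexEntry, &handle);
}
#endif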
2256
2257} // anonymous namespace
2258
2259} // namespace rocksdb