// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "db/compaction/compaction_picker.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "file/file_prefetch_buffer.h"
#include "file/file_util.h"
#include "file/random_access_file_reader.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "port/lang.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/snapshot.h"
#include "rocksdb/statistics.h"
#include "rocksdb/system_clock.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/trace_record.h"
#include "table/block_based/binary_search_index_reader.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_based_table_iterator.h"
#include "table/block_based/block_like_traits.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/block_type.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/filter_policy_internal.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/hash_index_reader.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_based/partitioned_index_reader.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/persistent_cache_options.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {
namespace {

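// Helper that copies `buf` into freshly allocated heap memory (obtained from
// `allocator` when one is given, else the default allocator), so the result's
// lifetime is independent of the source buffer.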
CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}
}  // namespace
}  // namespace ROCKSDB_NAMESPACE

// Generate the regular and coroutine versions of some methods by
// including block_based_table_reader_sync_and_async.h twice
// Macros in the header will expand differently based on whether
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
// clang-format off
#define WITHOUT_COROUTINES
#include "table/block_based/block_based_table_reader_sync_and_async.h"
#undef WITHOUT_COROUTINES
#define WITH_COROUTINES
#include "table/block_based/block_based_table_reader_sync_and_async.h"
#undef WITH_COROUTINES
// clang-format on
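// A minimal sketch of how the double include works, assuming the shared
// header keys its definitions off these macros (illustrative macro shape,
// not necessarily the exact one in util/coro_utils.h):
//
//   #if defined(WITHOUT_COROUTINES)
//   #define DEFINE_SYNC_AND_ASYNC(ret, name) ret name
//   #elif defined(WITH_COROUTINES)
//   #define DEFINE_SYNC_AND_ASYNC(ret, name) \
//     folly::coro::Task<ret> name##Coroutine
//   #endif
//
// Each method body written once in that header thus compiles twice: once as
// a plain synchronous member function and once as a coroutine variant.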

namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

BlockBasedTable::~BlockBasedTable() { delete rep_; }

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result.
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit,
    MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd,
    const FilterPolicy* filter_policy, bool async_read) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s;
  // If prefetch_buffer is not allocated, fall back to synchronous reading of
  // the block contents.
  if (async_read && prefetch_buffer != nullptr) {
    s = block_fetcher.ReadAsyncBlockContents();
    if (!s.ok()) {
      return s;
    }
  } else {
    s = block_fetcher.ReadBlockContents();
  }
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd,
        filter_policy));
  }

  return s;
}
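// A typical instantiation of ReadBlockFromFile (see ReadMetaIndexBlock below)
// uses TBlocklike = Block with do_uncompress = true, so the caller gets back
// parsed, already-uncompressed block contents.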

// For a hash-based index, return false if table_properties->prefix_extractor_name
// and prefix_extractor both exist and match; otherwise return true.
inline bool PrefixExtractorChangedHelper(
    const TableProperties* table_properties,
    const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off the hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable the hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name !=
      prefix_extractor->AsString()) {
    return true;
  } else {
    return false;
  }
}
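// Example: a table written with prefix_extractor_name ==
// "rocksdb.FixedPrefix.3" checked against a current
// NewFixedPrefixTransform(3), whose AsString() is also
// "rocksdb.FixedPrefix.3", returns false here, so a hash index built on
// those prefixes remains usable.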

}  // namespace

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.stats;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
    case BlockType::kFilterPartitionIndex:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.stats;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
    case BlockType::kFilterPartitionIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(
    BlockType block_type, GetContext* get_context, size_t usage, bool redundant,
    Statistics* const statistics) {
  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    if (redundant) {
      ++get_context->get_context_stats_.num_cache_add_redundant;
    }
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    if (redundant) {
      RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT);
    }
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
    case BlockType::kFilterPartitionIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        if (redundant) {
          ++get_context->get_context_stats_.num_cache_filter_add_redundant;
        }
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        if (redundant) {
          RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT);
        }
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        if (redundant) {
          ++get_context->get_context_stats_
                .num_cache_compression_dict_add_redundant;
        }
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        if (redundant) {
          RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT);
        }
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        if (redundant) {
          ++get_context->get_context_stats_.num_cache_index_add_redundant;
        }
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        if (redundant) {
          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT);
        }
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        if (redundant) {
          ++get_context->get_context_stats_.num_cache_data_add_redundant;
        }
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        if (redundant) {
          RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT);
        }
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

Cache::Handle* BlockBasedTable::GetEntryFromCache(
    const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
    BlockType block_type, const bool wait, GetContext* get_context,
    const Cache::CacheItemHelper* cache_helper,
    const Cache::CreateCallback& create_cb, Cache::Priority priority) const {
  Cache::Handle* cache_handle = nullptr;
  if (cache_tier == CacheTier::kNonVolatileBlockTier) {
    cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority,
                                       wait, rep_->ioptions.statistics.get());
  } else {
    cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get());
  }

  // Avoid updating metrics here if the handle is not complete yet. This
  // happens with MultiGet and the secondary cache. So update the metrics only
  // if it's a miss, or a hit whose value is ready.
  if (!cache_handle || block_cache->Value(cache_handle)) {
    if (cache_handle != nullptr) {
      UpdateCacheHitMetrics(block_type, get_context,
                            block_cache->GetUsage(cache_handle));
    } else {
      UpdateCacheMissMetrics(block_type, get_context);
    }
  }

  return cache_handle;
}
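// (With wait == false, the lookup may return a handle whose value is still
// being loaded from a secondary cache; see the comment above for why metric
// updates are deferred for such incomplete handles.)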

template <typename TBlocklike>
Status BlockBasedTable::InsertEntryToCache(
    const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
    const Cache::CacheItemHelper* cache_helper,
    std::unique_ptr<TBlocklike>&& block_holder, size_t charge,
    Cache::Handle** cache_handle, Cache::Priority priority) const {
  Status s = Status::OK();
  if (cache_tier == CacheTier::kNonVolatileBlockTier) {
    s = block_cache->Insert(key, block_holder.get(), cache_helper, charge,
                            cache_handle, priority);
  } else {
    s = block_cache->Insert(key, block_holder.get(), charge,
                            cache_helper->del_cb, cache_handle, priority);
  }
  if (s.ok()) {
    // Cache took ownership
    block_holder.release();
  }
  s.MustCheck();
  return s;
}
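// Note: the helper-based Insert() used for kNonVolatileBlockTier is the
// variant that can spill entries to a secondary cache; the deleter-based
// overload used otherwise keeps entries in the volatile tier only.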

namespace {
// Return true if `table_properties` has `user_prop_name` with a `true` value
// or doesn't contain this property (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

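// Decision summary for GetGlobalSequenceNumber below:
//   - No kVersion property: not an external SST, so any global seqno
//     property present is corruption.
//   - Version 1: global seqno is unsupported; its presence is corruption.
//   - Version >= 2: use the global seqno property if present; when the
//     caller knows largest_seqno, a stored 0 is upgraded to largest_seqno
//     and any other mismatch is corruption.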
// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
}  // namespace

void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
                                        const std::string& cur_db_session_id,
                                        uint64_t cur_file_number,
                                        OffsetableCacheKey* out_base_cache_key,
                                        bool* out_is_stable) {
  // Use a stable cache key if sufficient data is in table properties
  std::string db_session_id;
  uint64_t file_num;
  std::string db_id;
  if (properties && !properties->db_session_id.empty() &&
      properties->orig_file_number > 0) {
    // (Newer SST file case)
    // We must have both properties to get a stable unique id because
    // CreateColumnFamilyWithImport or IngestExternalFiles can change the
    // file numbers on a file.
    db_session_id = properties->db_session_id;
    file_num = properties->orig_file_number;
    // Less critical, populated in earlier release than above
    db_id = properties->db_id;
    if (out_is_stable) {
      *out_is_stable = true;
    }
  } else {
    // (Old SST file case)
    // We use (unique) cache keys based on current identifiers. These are at
    // least stable across table file close and re-open, but not across
    // different DBs nor DB close and re-open.
    db_session_id = cur_db_session_id;
    file_num = cur_file_number;
    // Plumbing through the DB ID to here would be annoying, and of limited
    // value because of the case of VersionSet::Recover opening some table
    // files and later setting the DB ID. So we just rely on uniqueness
    // level provided by session ID.
    db_id = "unknown";
    if (out_is_stable) {
      *out_is_stable = false;
    }
  }

  // Too many tests to update to get these working
  // assert(file_num > 0);
  // assert(!db_session_id.empty());
  // assert(!db_id.empty());

  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
  // from offsets. See GetCacheKey.
  *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num);
}

CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key,
                                      const BlockHandle& handle) {
  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
  // from the offset. See SetupBaseCacheKey.
  return base_cache_key.WithOffset(handle.offset() >> 2);
}
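// Example: because every block is followed by a 5-byte trailer (a compression
// type byte plus a 32-bit checksum), two distinct handles in one file differ
// in offset by at least 5, so their shifted offsets above can never collide.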

Status BlockBasedTable::Open(
    const ReadOptions& read_options, const ImmutableOptions& ioptions,
    const EnvOptions& env_options, const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, const bool force_direct_prefetch,
    TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer,
    size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
    uint64_t cur_file_num, UniqueId64x2 expected_unique_id) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // From read_options, retain deadline, io_timeout, and rate_limiter_priority.
  // In the future, we may retain more options. Specifically, we ignore
  // verify_checksums and default to checksum verification anyway when
  // creating the index and filter readers.
  ReadOptions ro;
  ro.deadline = read_options.deadline;
  ro.io_timeout = read_options.io_timeout;
  ro.rate_limiter_priority = read_options.rate_limiter_priority;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  if (!ioptions.allow_mmap_reads) {
    s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
                     tail_prefetch_stats, prefetch_all, preload_all,
                     &prefetch_buffer);
    // Return error in prefetch path to users.
    if (!s.ok()) {
      return s;
    }
  } else {
    // Should not prefetch for mmap mode.
    prefetch_buffer.reset(new FilePrefetchBuffer(
        0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */,
        true /* track_min_offset */));
  }

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  IOOptions opts;
  s = file->PrepareIOOptions(ro, opts);
  if (s.ok()) {
    s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size,
                           &footer, kBlockBasedTableMagicNumber);
  }
  if (!s.ok()) {
    return s;
  }
  if (!IsSupportedFormatVersion(footer.format_version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters,
                                      file_size, level, immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;

  // For fully portable/stable cache keys, we need to read the properties
  // block before setting up cache keys. TODO: consider setting up a bootstrap
  // cache key for PersistentCache to use for metaindex and properties blocks.
  rep->persistent_cache_options = PersistentCacheOptions();

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex,
                                    &metaindex_iter);
  if (!s.ok()) {
    return s;
  }

  // Populates table_properties and some fields that depend on it,
  // such as index_type.
  s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(),
                                     metaindex_iter.get(), largest_seqno);
  if (!s.ok()) {
    return s;
  }

  // Check expected unique id if provided
  if (expected_unique_id != kNullUniqueId64x2) {
    auto props = rep->table_properties;
    if (!props) {
      return Status::Corruption("Missing table properties on file " +
                                std::to_string(cur_file_num) +
                                " with known unique ID");
    }
    UniqueId64x2 actual_unique_id{};
    s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
                               props->orig_file_number, &actual_unique_id,
                               /*force*/ true);
    assert(s.ok());  // because force=true
    if (expected_unique_id != actual_unique_id) {
      return Status::Corruption(
          "Mismatch in unique ID on table file " +
          std::to_string(cur_file_num) +
          ". Expected: " + InternalUniqueIdToHumanString(&expected_unique_id) +
          " Actual: " + InternalUniqueIdToHumanString(&actual_unique_id));
    }
    TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::PassedVerifyUniqueId",
                             &actual_unique_id);
  } else {
    TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::SkippedVerifyUniqueId",
                             nullptr);
    if (ioptions.verify_sst_unique_id_in_manifest && ioptions.logger) {
      // A crude but isolated way of reporting unverified files. This should
      // not be an ongoing concern so doesn't deserve a place in Statistics
      // IMHO.
      static std::atomic<uint64_t> unverified_count{0};
      auto prev_count =
          unverified_count.fetch_add(1, std::memory_order_relaxed);
      if (prev_count == 0) {
        ROCKS_LOG_WARN(
            ioptions.logger,
            "At least one SST file opened without unique ID to verify: %" PRIu64
            ".sst",
            cur_file_num);
      } else if (prev_count % 1000 == 0) {
        ROCKS_LOG_WARN(
            ioptions.logger,
            "Another ~1000 SST files opened without unique ID to verify");
      }
    }
  }

  // Set up prefix extractor as needed
  bool force_null_table_prefix_extractor = false;
  TEST_SYNC_POINT_CALLBACK(
      "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
      &force_null_table_prefix_extractor);
  if (force_null_table_prefix_extractor) {
    assert(!rep->table_prefix_extractor);
  } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(),
                                           prefix_extractor.get())) {
    // Establish fast path for unchanged prefix_extractor
    rep->table_prefix_extractor = prefix_extractor;
  } else {
    // Current prefix_extractor doesn't match table
#ifndef ROCKSDB_LITE
    if (rep->table_properties) {
      //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
      // will need to use it
      ConfigOptions config_options;
      Status st = SliceTransform::CreateFromString(
          config_options, rep->table_properties->prefix_extractor_name,
          &(rep->table_prefix_extractor));
      if (!st.ok()) {
        //**TODO: Should this error be returned or swallowed?
        ROCKS_LOG_ERROR(rep->ioptions.logger,
                        "Failed to create prefix extractor[%s]: %s",
                        rep->table_properties->prefix_extractor_name.c_str(),
                        st.ToString().c_str());
      }
    }
#endif  // ROCKSDB_LITE
  }

  // With properties loaded, we can set up portable/stable cache keys
  SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id,
                    cur_file_num, &rep->base_cache_key);

  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             rep->base_cache_key, rep->ioptions.stats);

  s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
                                   metaindex_iter.get(), internal_comparator,
                                   &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
      prefetch_all, table_options, level, file_size,
      max_file_size_for_l0_meta_pin, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }
  }

  if (s.ok() && table_reader_cache_res_mgr) {
    std::size_t mem_usage = new_table->ApproximateMemoryUsage();
    s = table_reader_cache_res_mgr->MakeCacheReservation(
        mem_usage, &(rep->table_reader_cache_res_handle));
    if (s.IsMemoryLimit()) {
      s = Status::MemoryLimit(
          "Can't allocate " +
          kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
              CacheEntryRole::kBlockBasedTableReader)] +
          " due to memory limit based on "
          "cache capacity for memory allocation");
    }
  }

  if (s.ok()) {
    *table_reader = std::move(new_table);
  }
  return s;
}

Status BlockBasedTable::PrefetchTail(
    const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
    bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
    const bool prefetch_all, const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before reading the footer, readahead backwards to prefetch data. Do
    // more readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
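  // Example: with a 100 KiB file and a suggested 512 KiB tail prefetch, the
  // clamp above reads the whole file (prefetch_off = 0, prefetch_len =
  // 100 KiB); with a 64 MiB file, only the last 512 KiB are prefetched.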
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);

  // Try file system prefetch
  if (!file->use_direct_io() && !force_direct_prefetch) {
    if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
             .IsNotSupported()) {
      prefetch_buffer->reset(new FilePrefetchBuffer(
          0 /* readahead_size */, 0 /* max_readahead_size */,
          false /* enable */, true /* track_min_offset */));
      return Status::OK();
    }
  }

  // Use `FilePrefetchBuffer`
  prefetch_buffer->reset(
      new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */,
                             true /* enable */, true /* track_min_offset */));

  IOOptions opts;
  Status s = file->PrepareIOOptions(ro, opts);
  if (s.ok()) {
    s = (*prefetch_buffer)
            ->Prefetch(opts, file, prefetch_off, prefetch_len,
                       ro.rate_limiter_priority);
  }
  return s;
}

Status BlockBasedTable::ReadPropertiesBlock(
    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* meta_iter, const SequenceNumber largest_seqno) {
  Status s;
  BlockHandle handle;
  s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.logger,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (!handle.IsNull()) {
    s = meta_iter->status();
    std::unique_ptr<TableProperties> table_properties;
    if (s.ok()) {
      s = ReadTablePropertiesHelper(
          ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
    }
    IGNORE_STATUS_IF_ERROR(s);

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.logger,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties = std::move(table_properties);
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.logger,
                    "Cannot find Properties block from file.");
  }

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.logger);
    rep_->prefix_filtering &= IsFeatureSupported(
        *(rep_->table_properties),
        BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger);

    rep_->index_key_includes_seq =
        rep_->table_properties->index_key_is_user_key == 0;
    rep_->index_value_is_full =
        rep_->table_properties->index_value_is_delta_encoded == 0;

    // Update index_type with the true type.
    // If table properties don't contain index type, we assume that the table
    // is in very old format and has kBinarySearch index type.
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
    }

    rep_->index_has_first_key =
        rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
    }
  }
  return s;
}

Status BlockBasedTable::ReadRangeDelBlock(
    const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
  Status s;
  BlockHandle range_del_handle;
  s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.logger,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (!range_del_handle.IsNull()) {
    Status tmp_status;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle,
        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
        /*get_context=*/nullptr, lookup_context, prefetch_buffer,
        /*for_compaction= */ false, /*async_read= */ false, tmp_status));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.logger,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
      IGNORE_STATUS_IF_ERROR(s);
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}
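// Note: unlike the index and filter blocks handled below, the range
// tombstone block is read and fragmented eagerly here at open time, and the
// resulting FragmentedRangeTombstoneList is shared through rep_.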

Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    size_t file_size, size_t max_file_size_for_l0_meta_pin,
    BlockCacheLookupContext* lookup_context) {
  // Find filter handle and filter type
  if (rep_->filter_policy) {
    auto name = rep_->filter_policy->CompatibilityName();
    bool builtin_compatible =
        strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0;

    for (const auto& [filter_type, prefix] :
         {std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix),
          std::make_pair(Rep::FilterType::kPartitionedFilter,
                         kPartitionedFilterBlockPrefix),
          std::make_pair(Rep::FilterType::kNoFilter,
                         kObsoleteFilterBlockPrefix)}) {
      if (builtin_compatible) {
        // This code is only here to deal with a hiccup in early 7.0.x where
        // there was an unintentional name change in the SST files metadata.
        // It should be OK to remove this in the future (late 2022) and just
        // have the 'else' code.
        // NOTE: the test:: names below are likely not needed but included
        // out of caution
        static const std::unordered_set<std::string> kBuiltinNameAndAliases = {
            BuiltinFilterPolicy::kCompatibilityName(),
            test::LegacyBloomFilterPolicy::kClassName(),
            test::FastLocalBloomFilterPolicy::kClassName(),
            test::Standard128RibbonFilterPolicy::kClassName(),
            "rocksdb.internal.DeprecatedBlockBasedBloomFilter",
            BloomFilterPolicy::kClassName(),
            RibbonFilterPolicy::kClassName(),
        };

        // For efficiency, do a prefix seek and see if the first match is
        // good.
        meta_iter->Seek(prefix);
        if (meta_iter->status().ok() && meta_iter->Valid()) {
          Slice key = meta_iter->key();
          if (key.starts_with(prefix)) {
            key.remove_prefix(prefix.size());
            if (kBuiltinNameAndAliases.find(key.ToString()) !=
                kBuiltinNameAndAliases.end()) {
              Slice v = meta_iter->value();
              Status s = rep_->filter_handle.DecodeFrom(&v);
              if (s.ok()) {
                rep_->filter_type = filter_type;
                if (filter_type == Rep::FilterType::kNoFilter) {
                  ROCKS_LOG_WARN(rep_->ioptions.logger,
                                 "Detected obsolete filter type in %s. Read "
                                 "performance might suffer until DB is fully "
                                 "re-compacted.",
                                 rep_->file->file_name().c_str());
                }
                break;
              }
            }
          }
        }
      } else {
        std::string filter_block_key = prefix + name;
        if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
                .ok()) {
          rep_->filter_type = filter_type;
          if (filter_type == Rep::FilterType::kNoFilter) {
            ROCKS_LOG_WARN(
                rep_->ioptions.logger,
                "Detected obsolete filter type in %s. Read performance might "
                "suffer until DB is fully re-compacted.",
                rep_->file->file_name().c_str());
          }
          break;
        }
      }
    }
  }
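  // (The metaindex key searched for above is <prefix> + <policy name>, e.g.
  // "fullfilter.rocksdb.BuiltinBloomFilter" for a built-in full filter.)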
  // Partition filters cannot be enabled without partition indexes
  assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter ||
         rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);

  // Find compression dictionary handle
  Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
                                   &rep_->compression_dict_handle);
  if (!s.ok()) {
    return s;
  }

  BlockBasedTableOptions::IndexType index_type = rep_->index_type;

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  const bool maybe_flushed =
      level == 0 && file_size <= max_file_size_for_l0_meta_pin;
  std::function<bool(PinningTier, PinningTier)> is_pinned =
      [maybe_flushed, &is_pinned](PinningTier pinning_tier,
                                  PinningTier fallback_pinning_tier) {
        // Fallback to fallback would lead to infinite recursion. Disallow it.
        assert(fallback_pinning_tier != PinningTier::kFallback);

        switch (pinning_tier) {
          case PinningTier::kFallback:
            return is_pinned(fallback_pinning_tier,
                             PinningTier::kNone /* fallback_pinning_tier */);
          case PinningTier::kNone:
            return false;
          case PinningTier::kFlushedAndSimilar:
            return maybe_flushed;
          case PinningTier::kAll:
            return true;
        };

        // In GCC, this is needed to suppress `control reaches end of non-void
        // function [-Werror=return-type]`.
        assert(false);
        return false;
      };
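  // Example: an L0 file no larger than max_file_size_for_l0_meta_pin with
  // pin_l0_filter_and_index_blocks_in_cache == true and partition_pinning ==
  // kFallback resolves via kFlushedAndSimilar, and maybe_flushed then makes
  // is_pinned() return true for its partitions.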
  const bool pin_top_level_index = is_pinned(
      table_options.metadata_cache_options.top_level_index_pinning,
      table_options.pin_top_level_index_and_filter ? PinningTier::kAll
                                                   : PinningTier::kNone);
  const bool pin_partition =
      is_pinned(table_options.metadata_cache_options.partition_pinning,
                table_options.pin_l0_filter_and_index_blocks_in_cache
                    ? PinningTier::kFlushedAndSimilar
                    : PinningTier::kNone);
  const bool pin_unpartitioned =
      is_pinned(table_options.metadata_cache_options.unpartitioned_pinning,
                table_options.pin_l0_filter_and_index_blocks_in_cache
                    ? PinningTier::kFlushedAndSimilar
                    : PinningTier::kNone);

  // pin the first level of index
  const bool pin_index =
      index_type == BlockBasedTableOptions::kTwoLevelIndexSearch
          ? pin_top_level_index
          : pin_unpartitioned;
  // prefetch the first level of index
  // WART: this might be redundant (unnecessary cache hit) if !pin_index,
  // depending on prepopulate_block_cache option
  const bool prefetch_index = prefetch_all || pin_index;

  std::unique_ptr<IndexReader> index_reader;
  s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache,
                                   prefetch_index, pin_index, lookup_context,
                                   &index_reader);
  if (!s.ok()) {
    return s;
  }

  rep_->index_reader = std::move(index_reader);

  // The partitions of a partitioned index are always stored in the block
  // cache. They hence follow the pin and prefetch configuration regardless of
  // the value of cache_index_and_filter_blocks.
  if (prefetch_all || pin_partition) {
    s = rep_->index_reader->CacheDependencies(ro, pin_partition);
  }
  if (!s.ok()) {
    return s;
  }

  // pin the first level of filter
  const bool pin_filter =
      rep_->filter_type == Rep::FilterType::kPartitionedFilter
          ? pin_top_level_index
          : pin_unpartitioned;
  // prefetch the first level of filter
  // WART: this might be redundant (unnecessary cache hit) if !pin_filter,
  // depending on prepopulate_block_cache option
  const bool prefetch_filter = prefetch_all || pin_filter;

  if (rep_->filter_policy) {
    auto filter = new_table->CreateFilterBlockReader(
        ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter,
        lookup_context);

    if (filter) {
      // Refer to the comment above about partitioned indexes always being
      // cached.
      if (prefetch_all || pin_partition) {
        s = filter->CacheDependencies(ro, pin_partition);
        if (!s.ok()) {
          return s;
        }
      }
      rep_->filter = std::move(filter);
    }
  }

  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
    s = UncompressionDictReader::Create(
        this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned,
        pin_unpartitioned, lookup_context, &uncompression_dict_reader);
    if (!s.ok()) {
      return s;
    }

    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
  }

  assert(s.ok());
  return s;
}

void BlockBasedTable::SetupForCompaction() {
  switch (rep_->ioptions.access_hint_on_compaction_start) {
    case Options::NONE:
      break;
    case Options::NORMAL:
      rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
      break;
    case Options::SEQUENTIAL:
      rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
      break;
    case Options::WILLNEED:
      rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
      break;
    default:
      assert(false);
  }
}

std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}

size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_) {
    usage += rep_->ApproximateMemoryUsage();
  } else {
    return usage;
  }
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict_reader) {
    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
  }
  if (rep_->table_properties) {
    usage += rep_->table_properties->ApproximateMemoryUsage();
  }
  return usage;
}

// Load the meta-index block from the file. On success, return the loaded
// metaindex block and its iterator.
Status BlockBasedTable::ReadMetaIndexBlock(
    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<Block>* metaindex_block,
    std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  std::unique_ptr<Block> metaindex;
  Status s = ReadBlockFromFile(
      rep_->file.get(), prefetch_buffer, rep_->footer, ro,
      rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
      true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
      0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options),
      false /* for_compaction */, rep_->blocks_definitely_zstd_compressed,
      nullptr /* filter_policy */, false /* async_read */);

  if (!s.ok()) {
    ROCKS_LOG_ERROR(rep_->ioptions.logger,
                    "Encountered error while reading data from the metaindex "
                    "block %s",
                    s.ToString().c_str());
    return s;
  }

  *metaindex_block = std::move(metaindex);
  // meta block uses bytewise comparator.
  iter->reset(metaindex_block->get()->NewMetaIterator());
  return Status::OK();
}

template <typename TBlocklike>
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options,
    CachableEntry<TBlocklike>* out_parsed_block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    const bool wait, GetContext* get_context) const {
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  assert(out_parsed_block);
  assert(out_parsed_block->IsEmpty());
  // Here we treat the legacy name "...index_and_filter_blocks..." to mean all
  // metadata blocks that might go into block cache, EXCEPT only those needed
  // for the read path (Get, etc.). TableProperties should not be needed on the
  // read path (prefix extractor setting is an O(1) size special case that we
  // are working not to require from TableProperties), so it is not given
  // high-priority treatment if it should go into BlockCache.
  const Cache::Priority priority =
      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
              block_type != BlockType::kData &&
              block_type != BlockType::kProperties
          ? Cache::Priority::HIGH
          : Cache::Priority::LOW;

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;
  Statistics* statistics = rep_->ioptions.statistics.get();
  bool using_zstd = rep_->blocks_definitely_zstd_compressed;
  const FilterPolicy* filter_policy = rep_->filter_policy;
  Cache::CreateCallback create_cb = GetCreateCallback<TBlocklike>(
      read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    assert(!cache_key.empty());
    Cache::Handle* cache_handle = nullptr;
    cache_handle = GetEntryFromCache(
        rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
        block_type, wait, get_context,
        BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), create_cb,
        priority);
    if (cache_handle != nullptr) {
      out_parsed_block->SetCachedValue(
          reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(out_parsed_block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!cache_key.empty());
  BlockContents contents;
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(cache_key, statistics);

  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type =
      GetBlockCompressionType(*compressed_block);
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressSerializedBlock(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert the parsed block into the block cache; the priority is based on
  // the block type.
  if (s.ok()) {
    std::unique_ptr<TBlocklike> block_holder(
        BlocklikeTraits<TBlocklike>::Create(
            std::move(contents), read_amp_bytes_per_bit, statistics,
            rep_->blocks_definitely_zstd_compressed,
            rep_->table_options.filter_policy.get()));

    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      auto block_holder_raw_ptr = block_holder.get();
      s = InsertEntryToCache(
          rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
          BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
          std::move(block_holder), charge, &cache_handle, priority);
      if (s.ok()) {
        assert(cache_handle != nullptr);
        out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache,
                                         cache_handle);

        UpdateCacheInsertionMetrics(block_type, get_context, charge,
                                    s.IsOkOverwritten(), rep_->ioptions.stats);
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      out_parsed_block->SetOwnedValue(std::move(block_holder));
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}
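// (Lookup order above: the uncompressed block cache first; on a miss, the
// compressed block cache, whose hit is uncompressed and, if fill_cache
// permits, re-inserted into the uncompressed cache before returning.)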
1396
1397template <typename TBlocklike>
1398Status BlockBasedTable::PutDataBlockToCache(
1e59de90
TL
1399 const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
1400 CachableEntry<TBlocklike>* out_parsed_block, BlockContents&& block_contents,
1401 CompressionType block_comp_type,
20effc67 1402 const UncompressionDict& uncompression_dict,
f67539c2
TL
1403 MemoryAllocator* memory_allocator, BlockType block_type,
1404 GetContext* get_context) const {
1e59de90 1405 const ImmutableOptions& ioptions = rep_->ioptions;
f67539c2
TL
1406 const uint32_t format_version = rep_->table_options.format_version;
1407 const size_t read_amp_bytes_per_bit =
1408 block_type == BlockType::kData
1409 ? rep_->table_options.read_amp_bytes_per_bit
1410 : 0;
1411 const Cache::Priority priority =
1412 rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1e59de90 1413 block_type != BlockType::kData
f67539c2
TL
1414 ? Cache::Priority::HIGH
1415 : Cache::Priority::LOW;
1e59de90
TL
1416 assert(out_parsed_block);
1417 assert(out_parsed_block->IsEmpty());
f67539c2
TL
1418
1419 Status s;
1e59de90 1420 Statistics* statistics = ioptions.stats;
f67539c2
TL
1421
1422 std::unique_ptr<TBlocklike> block_holder;
1e59de90 1423 if (block_comp_type != kNoCompression) {
f67539c2
TL
1424 // Retrieve the uncompressed contents into a new buffer
1425 BlockContents uncompressed_block_contents;
1426 UncompressionContext context(block_comp_type);
1427 UncompressionInfo info(context, uncompression_dict, block_comp_type);
1428 s = UncompressBlockData(info, block_contents.data.data(),
1429 block_contents.data.size(),
1430 &uncompressed_block_contents, format_version,
1431 ioptions, memory_allocator);
1432 if (!s.ok()) {
1433 return s;
1434 }
1435
1436 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
20effc67 1437 std::move(uncompressed_block_contents), read_amp_bytes_per_bit,
1438 statistics, rep_->blocks_definitely_zstd_compressed,
1439 rep_->table_options.filter_policy.get()));
1440 } else {
1441 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1e59de90 1442 std::move(block_contents), read_amp_bytes_per_bit, statistics,
20effc67 1443 rep_->blocks_definitely_zstd_compressed,
1444 rep_->table_options.filter_policy.get()));
1445 }
1446
1447 // Insert compressed block into compressed block cache.
1448 // Release the hold on the compressed cache entry immediately.
1449 if (block_cache_compressed != nullptr && block_comp_type != kNoCompression &&
1450 block_contents.own_bytes()) {
1451 assert(block_contents.has_trailer);
1452 assert(!cache_key.empty());
1453
1454 // We cannot directly insert block_contents because it could point to
f67539c2 1455 // an object on the stack.
1456 auto block_cont_for_comp_cache =
1457 std::make_unique<BlockContents>(std::move(block_contents));
1458 size_t charge = block_cont_for_comp_cache->ApproximateMemoryUsage();
1459
f67539c2 1460 s = block_cache_compressed->Insert(
1461 cache_key, block_cont_for_comp_cache.get(), charge,
1462 &DeleteCacheEntry<BlockContents>, nullptr /*handle*/,
1463 Cache::Priority::LOW);
1464
f67539c2 1465 if (s.ok()) {
1466 // Cache took ownership
1467 block_cont_for_comp_cache.release();
1468 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1469 } else {
1470 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1471 }
1472 }
1473
1474 // insert into uncompressed block cache
1475 if (block_cache != nullptr && block_holder->own_bytes()) {
1476 size_t charge = block_holder->ApproximateMemoryUsage();
1e59de90 1477 auto block_holder_raw_ptr = block_holder.get();
f67539c2 1478 Cache::Handle* cache_handle = nullptr;
1479 s = InsertEntryToCache(
1480 rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
1481 BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
1482 std::move(block_holder), charge, &cache_handle, priority);
1483 if (s.ok()) {
1484 assert(cache_handle != nullptr);
1485 out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache,
1486 cache_handle);
f67539c2 1487
20effc67 1488 UpdateCacheInsertionMetrics(block_type, get_context, charge,
1e59de90 1489 s.IsOkOverwritten(), rep_->ioptions.stats);
1490 } else {
1491 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1492 }
1493 } else {
1e59de90 1494 out_parsed_block->SetOwnedValue(std::move(block_holder));
1495 }
1496
1497 return s;
1498}
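// Example (illustrative sketch, not part of the original source): how the two
// caches that PutDataBlockToCache() fills are typically configured, and how
// the HIGH/LOW priority split above is enabled. Uses the public RocksDB API;
// the capacities are arbitrary.
//
//   rocksdb::BlockBasedTableOptions bbto;
//   // Uncompressed (parsed) block cache with a 50% high-priority pool:
//   bbto.block_cache = rocksdb::NewLRUCache(
//       512 << 20 /* capacity */, -1 /* num_shard_bits */,
//       false /* strict_capacity_limit */, 0.5 /* high_pri_pool_ratio */);
//   // Optional compressed block cache holding serialized blocks:
//   bbto.block_cache_compressed = rocksdb::NewLRUCache(128 << 20);
//   bbto.cache_index_and_filter_blocks = true;
//   bbto.cache_index_and_filter_blocks_with_high_priority = true;
//   rocksdb::Options options;
//   options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));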
1499
1500std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1501 const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache,
1502 bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) {
1503 auto& rep = rep_;
1504 auto filter_type = rep->filter_type;
1505 if (filter_type == Rep::FilterType::kNoFilter) {
1506 return std::unique_ptr<FilterBlockReader>();
1507 }
1508
1509 assert(rep->filter_policy);
1510
1511 switch (filter_type) {
1512 case Rep::FilterType::kPartitionedFilter:
1513 return PartitionedFilterBlockReader::Create(
20effc67 1514 this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
f67539c2 1515
f67539c2 1516 case Rep::FilterType::kFullFilter:
20effc67 1517 return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache,
1518 prefetch, pin, lookup_context);
1519
1520 default:
1521 // filter_type is either kNoFilter (exited the function at the first if),
1522 // or it must be covered in this switch block
1523 assert(false);
1524 return std::unique_ptr<FilterBlockReader>();
1525 }
1526}
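// Example (illustrative sketch, not part of the original source): the
// table-level options that determine which branch of the switch above is
// taken when a file is later opened. Uses the public RocksDB API.
//
//   rocksdb::BlockBasedTableOptions bbto;
//   bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10.0));
//   // partition_filters = true yields Rep::FilterType::kPartitionedFilter
//   // and requires the two-level index; otherwise a full filter
//   // (Rep::FilterType::kFullFilter) is written.
//   bbto.partition_filters = true;
//   bbto.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;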
1527
1528 // disable_prefix_seek should be set to true when the prefix_extractor found in
1529 // the SST differs from the one in mutable_cf_options and the index type is kHashSearch
1530InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1531 const ReadOptions& read_options, bool disable_prefix_seek,
1532 IndexBlockIter* input_iter, GetContext* get_context,
1533 BlockCacheLookupContext* lookup_context) const {
1534 assert(rep_ != nullptr);
1535 assert(rep_->index_reader != nullptr);
1536
1537 // We don't return pinned data from index blocks, so no need
1538 // to set `block_contents_pinned`.
1539 return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1540 input_iter, get_context,
1541 lookup_context);
1542}
1543
1544template <>
1545DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
1546 const Rep* rep, Block* block, BlockType block_type,
1547 DataBlockIter* input_iter, bool block_contents_pinned) {
1548 return block->NewDataIterator(rep->internal_comparator.user_comparator(),
1549 rep->get_global_seqno(block_type), input_iter,
1e59de90 1550 rep->ioptions.stats, block_contents_pinned);
1551}
1552
1553template <>
1554IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
1555 const Rep* rep, Block* block, BlockType block_type,
1556 IndexBlockIter* input_iter, bool block_contents_pinned) {
f67539c2 1557 return block->NewIndexIterator(
20effc67 1558 rep->internal_comparator.user_comparator(),
1e59de90 1559 rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats,
1560 /* total_order_seek */ true, rep->index_has_first_key,
1561 rep->index_key_includes_seq, rep->index_value_is_full,
1562 block_contents_pinned);
1563}
1564
1565// If contents is nullptr, this function looks up the block caches for the
1566 // data block referenced by handle, and reads the block from disk if necessary.
1567// If contents is non-null, it skips the cache lookup and disk read, since
1568// the caller has already read it. In both cases, if ro.fill_cache is true,
1569// it inserts the block into the block cache.
1570template <typename TBlocklike>
1571Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
1572 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1573 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1574 const bool wait, const bool for_compaction,
1575 CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type,
f67539c2 1576 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1577 BlockContents* contents, bool async_read) const {
1578 assert(out_parsed_block != nullptr);
1579 const bool no_io = (ro.read_tier == kBlockCacheTier);
1580 Cache* block_cache = rep_->table_options.block_cache.get();
f67539c2 1581 Cache* block_cache_compressed =
20effc67 1582 rep_->table_options.block_cache_compressed.get();
1583
1584 // First, try to get the block from the cache
1585 //
1586 // If either block cache is enabled, we'll try to read from it.
1587 Status s;
1588 CacheKey key_data;
1589 Slice key;
1590 bool is_cache_hit = false;
1591 if (block_cache != nullptr || block_cache_compressed != nullptr) {
1592 // create key for block cache
1593 key_data = GetCacheKey(rep_->base_cache_key, handle);
1594 key = key_data.AsSlice();
1595
1596 if (!contents) {
1597 s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro,
1598 out_parsed_block, uncompression_dict,
1599 block_type, wait, get_context);
1600 // Value could still be null at this point, so check the cache handle
1601 // and update the read pattern for prefetching
1602 if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) {
1603 // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
1604 // compressed block cache.
1605 is_cache_hit = true;
1606 if (prefetch_buffer) {
1607 // Update the block details so that PrefetchBuffer can use the read
1608 // pattern to determine if reads are sequential or not for
1609 // prefetching. It should also take into account blocks read from cache.
1610 prefetch_buffer->UpdateReadPattern(
1611 handle.offset(), BlockSizeWithTrailer(handle),
1612 ro.adaptive_readahead /*decrease_readahead_size*/);
1613 }
1614 }
1615 }
1616
1617 // Couldn't find the block in the cache. If I/O is allowed, read it from
1618 // the file.
1619 if (out_parsed_block->GetValue() == nullptr &&
1620 out_parsed_block->GetCacheHandle() == nullptr && !no_io &&
1621 ro.fill_cache) {
1622 Statistics* statistics = rep_->ioptions.stats;
1623 const bool maybe_compressed =
1624 block_type != BlockType::kFilter &&
1625 block_type != BlockType::kCompressionDictionary &&
1626 rep_->blocks_maybe_compressed;
1627 const bool do_uncompress = maybe_compressed && !block_cache_compressed;
1628 CompressionType contents_comp_type;
1629 // Maybe serialized or uncompressed
1630 BlockContents tmp_contents;
f67539c2 1631 if (!contents) {
1632 Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS
1633 : READ_BLOCK_GET_MICROS;
1634 StopWatch sw(rep_->ioptions.clock, statistics, histogram);
1635 BlockFetcher block_fetcher(
1636 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
1637 &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed,
1638 block_type, uncompression_dict, rep_->persistent_cache_options,
1639 GetMemoryAllocator(rep_->table_options),
1640 GetMemoryAllocatorForCompressedBlock(rep_->table_options));
1641
1642 // If prefetch_buffer is not allocated, it will fall back to synchronous
1643 // reading of block contents.
1644 if (async_read && prefetch_buffer != nullptr) {
1645 s = block_fetcher.ReadAsyncBlockContents();
1646 if (!s.ok()) {
1647 return s;
1648 }
1649 } else {
1650 s = block_fetcher.ReadBlockContents();
1651 }
1652
1653 contents_comp_type = block_fetcher.get_compression_type();
1654 contents = &tmp_contents;
1655 if (get_context) {
1656 switch (block_type) {
1657 case BlockType::kIndex:
1658 ++get_context->get_context_stats_.num_index_read;
1659 break;
1660 case BlockType::kFilter:
1e59de90 1661 case BlockType::kFilterPartitionIndex:
1662 ++get_context->get_context_stats_.num_filter_read;
1663 break;
1664 default:
1665 break;
1666 }
1667 }
f67539c2 1668 } else {
1e59de90 1669 contents_comp_type = GetBlockCompressionType(*contents);
1670 }
1671
1672 if (s.ok()) {
1673 // If filling cache is allowed and a cache is configured, try to put the
1674 // block to the cache.
1675 s = PutDataBlockToCache(
1676 key, block_cache, block_cache_compressed, out_parsed_block,
1677 std::move(*contents), contents_comp_type, uncompression_dict,
1678 GetMemoryAllocator(rep_->table_options), block_type, get_context);
1679 }
1680 }
1681 }
1682
1683 // Fill lookup_context.
1684 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
1685 lookup_context) {
1686 size_t usage = 0;
1687 uint64_t nkeys = 0;
1e59de90 1688 if (out_parsed_block->GetValue()) {
f67539c2 1689 // Approximate the number of keys in the block using restarts.
1690 nkeys = rep_->table_options.block_restart_interval *
1691 BlocklikeTraits<TBlocklike>::GetNumRestarts(
1692 *out_parsed_block->GetValue());
1693 usage = out_parsed_block->GetValue()->ApproximateMemoryUsage();
1694 }
1695 TraceType trace_block_type = TraceType::kTraceMax;
1696 switch (block_type) {
1697 case BlockType::kData:
1698 trace_block_type = TraceType::kBlockTraceDataBlock;
1699 break;
1700 case BlockType::kFilter:
1e59de90 1701 case BlockType::kFilterPartitionIndex:
1702 trace_block_type = TraceType::kBlockTraceFilterBlock;
1703 break;
1704 case BlockType::kCompressionDictionary:
1705 trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
1706 break;
1707 case BlockType::kRangeDeletion:
1708 trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
1709 break;
1710 case BlockType::kIndex:
1711 trace_block_type = TraceType::kBlockTraceIndexBlock;
1712 break;
1713 default:
1714 // This cannot happen.
1715 assert(false);
1716 break;
1717 }
1718 bool no_insert = no_io || !ro.fill_cache;
1719 if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
1720 trace_block_type, lookup_context->caller)) {
1721 // Defer logging the access to Get() and MultiGet() to trace additional
1722 // information, e.g., referenced_key_exist_in_block.
1723
1724 // Make a copy of the block key here since it will be logged later.
1725 lookup_context->FillLookupContext(
1726 is_cache_hit, no_insert, trace_block_type,
1727 /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
1728 } else {
1729 // Avoid making copies of block_key and cf_name when constructing the
1730 // access record.
1731 BlockCacheTraceRecord access_record(
1e59de90 1732 rep_->ioptions.clock->NowMicros(),
1733 /*block_key=*/"", trace_block_type,
1734 /*block_size=*/usage, rep_->cf_id_for_tracing(),
1735 /*cf_name=*/"", rep_->level_for_tracing(),
1736 rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
1737 no_insert, lookup_context->get_id,
1738 lookup_context->get_from_user_specified_snapshot,
1739 /*referenced_key=*/"");
1740 // TODO: Should handle this error?
1741 block_cache_tracer_
1742 ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(),
1743 lookup_context->referenced_key)
1744 .PermitUncheckedError();
1745 }
1746 }
1747
1e59de90 1748 assert(s.ok() || out_parsed_block->GetValue() == nullptr);
1749 return s;
1750}
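// Example (illustrative sketch, not part of the original source): the
// ReadOptions knobs that drive the no_io and fill_cache branches above, as a
// caller would set them. Assumes the public API and #include <rocksdb/db.h>.
//
//   rocksdb::ReadOptions ro;
//   ro.read_tier = rocksdb::kBlockCacheTier;  // no_io: never touch the file
//   ro.fill_cache = false;  // read blocks without polluting the block cache
//   std::string value;
//   rocksdb::Status s = db->Get(ro, "key", &value);
//   if (s.IsIncomplete()) {
//     // The block was not in the cache and I/O was not allowed.
//   }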
1751
1752template <typename TBlocklike>
1753Status BlockBasedTable::RetrieveBlock(
1754 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1755 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1e59de90 1756 CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type,
f67539c2 1757 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1758 bool for_compaction, bool use_cache, bool wait_for_cache,
1759 bool async_read) const {
1760 assert(out_parsed_block);
1761 assert(out_parsed_block->IsEmpty());
1762
1763 Status s;
1764 if (use_cache) {
1765 s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
1766 uncompression_dict, wait_for_cache,
1767 for_compaction, out_parsed_block,
f67539c2 1768 block_type, get_context, lookup_context,
1e59de90 1769 /*contents=*/nullptr, async_read);
1770
1771 if (!s.ok()) {
1772 return s;
1773 }
1774
1775 if (out_parsed_block->GetValue() != nullptr ||
1776 out_parsed_block->GetCacheHandle() != nullptr) {
1777 assert(s.ok());
1778 return s;
1779 }
1780 }
1781
1e59de90 1782 assert(out_parsed_block->IsEmpty());
1783
1784 const bool no_io = ro.read_tier == kBlockCacheTier;
1785 if (no_io) {
1786 return Status::Incomplete("no blocking io");
1787 }
1788
1789 const bool maybe_compressed =
1790 block_type != BlockType::kFilter &&
1791 block_type != BlockType::kCompressionDictionary &&
1792 rep_->blocks_maybe_compressed;
1793 const bool do_uncompress = maybe_compressed;
1794 std::unique_ptr<TBlocklike> block;
1795
1796 {
1797 Histograms histogram =
1798 for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
1799 StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
1800 s = ReadBlockFromFile(
1801 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
1802 rep_->ioptions, do_uncompress, maybe_compressed, block_type,
1803 uncompression_dict, rep_->persistent_cache_options,
1804 block_type == BlockType::kData
1805 ? rep_->table_options.read_amp_bytes_per_bit
1806 : 0,
1807 GetMemoryAllocator(rep_->table_options), for_compaction,
1808 rep_->blocks_definitely_zstd_compressed,
1e59de90 1809 rep_->table_options.filter_policy.get(), async_read);
1810
1811 if (get_context) {
1812 switch (block_type) {
1813 case BlockType::kIndex:
1814 ++(get_context->get_context_stats_.num_index_read);
1815 break;
1816 case BlockType::kFilter:
1e59de90 1817 case BlockType::kFilterPartitionIndex:
1818 ++(get_context->get_context_stats_.num_filter_read);
1819 break;
1820 default:
1821 break;
1822 }
1823 }
1824 }
1825
1826 if (!s.ok()) {
1827 return s;
1828 }
1829
1e59de90 1830 out_parsed_block->SetOwnedValue(std::move(block));
1831
1832 assert(s.ok());
1833 return s;
1834}
1835
1e59de90 1836// Explicitly instantiate templates for each "blocklike" type we use.
f67539c2 1837// This makes it possible to keep the template definitions in the .cc file.
1838template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
1839 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1840 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1841 CachableEntry<ParsedFullFilterBlock>* out_parsed_block,
1842 BlockType block_type, GetContext* get_context,
1843 BlockCacheLookupContext* lookup_context, bool for_compaction,
1844 bool use_cache, bool wait_for_cache, bool async_read) const;
1845
1846template Status BlockBasedTable::RetrieveBlock<Block>(
1847 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1848 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1e59de90 1849 CachableEntry<Block>* out_parsed_block, BlockType block_type,
f67539c2 1850 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1851 bool for_compaction, bool use_cache, bool wait_for_cache,
1852 bool async_read) const;
1853
1854template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
1855 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1856 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1e59de90 1857 CachableEntry<UncompressionDict>* out_parsed_block, BlockType block_type,
f67539c2 1858 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1859 bool for_compaction, bool use_cache, bool wait_for_cache,
1860 bool async_read) const;
1861
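// The pattern above in miniature (illustrative sketch with hypothetical
// names): the template definition stays in the .cc file, the explicit
// instantiation emits the code, and an extern declaration in the header
// promises the linker that the instantiation exists.
//
//   // twice.h
//   template <typename T> T Twice(T v);
//   extern template int Twice<int>(int);  // suppress implicit instantiation
//
//   // twice.cc
//   template <typename T> T Twice(T v) { return v + v; }
//   template int Twice<int>(int);  // explicit instantiation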
1862BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
1863 const BlockBasedTable* table,
1e59de90 1864 UnorderedMap<uint64_t, CachableEntry<Block>>* block_map)
1865 : table_(table), block_map_(block_map) {}
1866
1867InternalIteratorBase<IndexValue>*
1868BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
1869 const BlockHandle& handle) {
1870 // Return a block iterator on the index partition
1871 auto block = block_map_->find(handle.offset());
1872 // block_map_ must be exhaustive
1873 if (block == block_map_->end()) {
1874 assert(false);
1875 // Signal problem to caller
1876 return nullptr;
1877 }
1878 const Rep* rep = table_->get_rep();
1879 assert(rep);
1880
1881 Statistics* kNullStats = nullptr;
1882 // We don't return pinned data from index blocks, so no need
1883 // to set `block_contents_pinned`.
1884 return block->second.GetValue()->NewIndexIterator(
1885 rep->internal_comparator.user_comparator(),
1886 rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
1887 rep->index_has_first_key, rep->index_key_includes_seq,
1888 rep->index_value_is_full);
1889}
1890
1891// This will be broken if the user specifies an unusual implementation
1892// of Options.comparator, or if the user specifies an unusual
1893// definition of prefixes in BlockBasedTableOptions.filter_policy.
1894// In particular, we require the following three properties:
1895//
1896// 1) key.starts_with(prefix(key))
1897// 2) Compare(prefix(key), key) <= 0.
1898// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
1899//
1900// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and
1901// will return true if the filter block is not in memory and not found in block
1902// cache.
1903//
1904// REQUIRES: this method shouldn't be called while the DB lock is held.
1e59de90 1905bool BlockBasedTable::PrefixRangeMayMatch(
1906 const Slice& internal_key, const ReadOptions& read_options,
1907 const SliceTransform* options_prefix_extractor,
1908 const bool need_upper_bound_check,
1909 BlockCacheLookupContext* lookup_context) const {
1910 if (!rep_->filter_policy) {
1911 return true;
1912 }
1913
1914 const SliceTransform* prefix_extractor;
1915
1916 if (rep_->table_prefix_extractor == nullptr) {
1917 if (need_upper_bound_check) {
1918 return true;
1919 }
1920 prefix_extractor = options_prefix_extractor;
1921 } else {
1922 prefix_extractor = rep_->table_prefix_extractor.get();
1923 }
1924 auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
1925 auto user_key_without_ts =
1926 ExtractUserKeyAndStripTimestamp(internal_key, ts_sz);
1927 if (!prefix_extractor->InDomain(user_key_without_ts)) {
1928 return true;
1929 }
1930
1931 bool may_match = true;
f67539c2 1932
f67539c2 1933 FilterBlockReader* const filter = rep_->filter.get();
1e59de90 1934 bool filter_checked = false;
f67539c2 1935 if (filter != nullptr) {
1936 const bool no_io = read_options.read_tier == kBlockCacheTier;
1937
1938 const Slice* const const_ikey_ptr = &internal_key;
1939 may_match = filter->RangeMayExist(
1940 read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor,
1941 rep_->internal_comparator.user_comparator(), const_ikey_ptr,
1942 &filter_checked, need_upper_bound_check, no_io, lookup_context,
1943 read_options.rate_limiter_priority);
1944 }
1945
1946 if (filter_checked) {
1e59de90 1947 Statistics* statistics = rep_->ioptions.stats;
1948 RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
1949 if (!may_match) {
1950 RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
1951 }
1952 }
1953
1954 return may_match;
1955}
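// Example (illustrative sketch, not part of the original source): a prefix
// extractor satisfying properties 1)-3) above. The built-in fixed-prefix
// transform does, with a bytewise comparator:
//
//   rocksdb::Options options;
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
//   // prefix("user1234-xyz") == "user":
//   //   1) the key starts with its prefix,
//   //   2) the prefix compares <= the full key, and
//   //   3) key ordering implies prefix ordering.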
1956
1957bool BlockBasedTable::PrefixExtractorChanged(
1958 const SliceTransform* prefix_extractor) const {
1959 if (prefix_extractor == nullptr) {
1960 return true;
1961 } else if (prefix_extractor == rep_->table_prefix_extractor.get()) {
1962 return false;
1963 } else {
1964 return PrefixExtractorChangedHelper(rep_->table_properties.get(),
1965 prefix_extractor);
1966 }
1967}
1968
1969InternalIterator* BlockBasedTable::NewIterator(
1970 const ReadOptions& read_options, const SliceTransform* prefix_extractor,
1971 Arena* arena, bool skip_filters, TableReaderCaller caller,
20effc67 1972 size_t compaction_readahead_size, bool allow_unprepared_value) {
1973 BlockCacheLookupContext lookup_context{caller};
1974 bool need_upper_bound_check =
1e59de90 1975 read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor);
1976 std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
1977 read_options,
1e59de90 1978 /*disable_prefix_seek=*/need_upper_bound_check &&
1979 rep_->index_type == BlockBasedTableOptions::kHashSearch,
1980 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
f67539c2 1981 if (arena == nullptr) {
1982 return new BlockBasedTableIterator(
1983 this, read_options, rep_->internal_comparator, std::move(index_iter),
1984 !skip_filters && !read_options.total_order_seek &&
1985 prefix_extractor != nullptr,
1986 need_upper_bound_check, prefix_extractor, caller,
1987 compaction_readahead_size, allow_unprepared_value);
f67539c2 1988 } else {
1989 auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
1990 return new (mem) BlockBasedTableIterator(
1991 this, read_options, rep_->internal_comparator, std::move(index_iter),
1992 !skip_filters && !read_options.total_order_seek &&
1993 prefix_extractor != nullptr,
1994 need_upper_bound_check, prefix_extractor, caller,
1995 compaction_readahead_size, allow_unprepared_value);
1996 }
1997}
1998
1999FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
2000 const ReadOptions& read_options) {
2001 if (rep_->fragmented_range_dels == nullptr) {
2002 return nullptr;
2003 }
2004 SequenceNumber snapshot = kMaxSequenceNumber;
2005 if (read_options.snapshot != nullptr) {
2006 snapshot = read_options.snapshot->GetSequenceNumber();
2007 }
2008 return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels,
2009 rep_->internal_comparator,
2010 snapshot, read_options.timestamp);
2011}
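// Example (illustrative sketch, not part of the original source): where the
// fragmented range tombstones iterated above originate, and how a snapshot
// bounds their visibility. Assumes the public API and #include <rocksdb/db.h>.
//
//   db->Put(rocksdb::WriteOptions(), "a", "1");
//   const rocksdb::Snapshot* snap = db->GetSnapshot();
//   db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
//                   "a", "z");  // range tombstone over ["a", "z")
//   rocksdb::ReadOptions ro;
//   ro.snapshot = snap;  // tombstone seqno > snapshot seqno, so it is ignored
//   std::string v;
//   rocksdb::Status s = db->Get(ro, "a", &v);  // still finds "a" -> "1"
//   db->ReleaseSnapshot(snap);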
2012
2013bool BlockBasedTable::FullFilterKeyMayMatch(
1e59de90 2014 FilterBlockReader* filter, const Slice& internal_key, const bool no_io,
f67539c2 2015 const SliceTransform* prefix_extractor, GetContext* get_context,
2016 BlockCacheLookupContext* lookup_context,
2017 Env::IOPriority rate_limiter_priority) const {
2018 if (filter == nullptr) {
2019 return true;
2020 }
2021 Slice user_key = ExtractUserKey(internal_key);
2022 const Slice* const const_ikey_ptr = &internal_key;
2023 bool may_match = true;
2024 size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
2025 Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
f67539c2 2026 if (rep_->whole_key_filtering) {
f67539c2 2027 may_match =
2028 filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr,
2029 get_context, lookup_context, rate_limiter_priority);
2030 } else if (!PrefixExtractorChanged(prefix_extractor) &&
2031 prefix_extractor->InDomain(user_key_without_ts) &&
2032 !filter->PrefixMayMatch(
2033 prefix_extractor->Transform(user_key_without_ts), no_io,
2034 const_ikey_ptr, get_context, lookup_context,
2035 rate_limiter_priority)) {
2036 // FIXME ^^^: there should be no reason for Get() to depend on current
2037 // prefix_extractor at all. It should always use table_prefix_extractor.
2038 may_match = false;
2039 }
2040 if (may_match) {
1e59de90 2041 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE);
2042 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
2043 }
2044 return may_match;
2045}
2046
2047void BlockBasedTable::FullFilterKeysMayMatch(
1e59de90 2048 FilterBlockReader* filter, MultiGetRange* range, const bool no_io,
f67539c2 2049 const SliceTransform* prefix_extractor,
2050 BlockCacheLookupContext* lookup_context,
2051 Env::IOPriority rate_limiter_priority) const {
2052 if (filter == nullptr) {
2053 return;
2054 }
2055 uint64_t before_keys = range->KeysLeft();
2056 assert(before_keys > 0); // Caller should ensure
f67539c2 2057 if (rep_->whole_key_filtering) {
1e59de90 2058 filter->KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority);
2059 uint64_t after_keys = range->KeysLeft();
2060 if (after_keys) {
1e59de90 2061 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys);
2062 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys,
2063 rep_->level);
2064 }
2065 uint64_t filtered_keys = before_keys - after_keys;
2066 if (filtered_keys) {
1e59de90 2067 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys);
2068 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys,
2069 rep_->level);
2070 }
2071 } else if (!PrefixExtractorChanged(prefix_extractor)) {
2072 // FIXME ^^^: there should be no reason for MultiGet() to depend on current
2073 // prefix_extractor at all. It should always use table_prefix_extractor.
2074 filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context,
2075 rate_limiter_priority);
2076 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys);
2077 uint64_t after_keys = range->KeysLeft();
2078 uint64_t filtered_keys = before_keys - after_keys;
2079 if (filtered_keys) {
1e59de90 2080 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL,
2081 filtered_keys);
2082 }
2083 }
2084}
2085
2086Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
2087 std::vector<Anchor>& anchors) {
2088 // We iterate over the whole index block here. A more efficient
2089 // implementation would push this operation into IndexReader. For example,
2090 // we could sample directly from the restart entries in the index block and
2091 // read only the keys needed. Here we take the simple solution; performance
2092 // is unlikely to be a problem. We are compacting the whole file, so all
2093 // keys will be read out anyway. An extra read of the index block is likely
2094 // a small share of the overhead. We can optimize if needed.
2095 IndexBlockIter iiter_on_stack;
2096 auto iiter = NewIndexIterator(
2097 read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
2098 /*get_context=*/nullptr, /*lookup_context=*/nullptr);
2099 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2100 if (iiter != &iiter_on_stack) {
2101 iiter_unique_ptr.reset(iiter);
2102 }
2103
2104 // If needed, the threshold could be more adaptive. For example, it could
2105 // be based on size, so that a larger file is sampled into more partitions
2106 // than a smaller one. The size might also need to be passed in by the
2107 // caller based on total compaction size.
2108 const uint64_t kMaxNumAnchors = uint64_t{128};
2109 uint64_t num_blocks = this->GetTableProperties()->num_data_blocks;
2110 uint64_t num_blocks_per_anchor = num_blocks / kMaxNumAnchors;
2111 if (num_blocks_per_anchor == 0) {
2112 num_blocks_per_anchor = 1;
2113 }
2114
2115 uint64_t count = 0;
2116 std::string last_key;
2117 uint64_t range_size = 0;
2118 uint64_t prev_offset = 0;
2119 for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
2120 const BlockHandle& bh = iiter->value().handle;
2121 range_size += bh.offset() + bh.size() - prev_offset;
2122 prev_offset = bh.offset() + bh.size();
2123 if (++count % num_blocks_per_anchor == 0) {
2124 count = 0;
2125 anchors.emplace_back(iiter->user_key(), range_size);
2126 range_size = 0;
2127 } else {
2128 last_key = iiter->user_key().ToString();
2129 }
2130 }
2131 if (count != 0) {
2132 anchors.emplace_back(last_key, range_size);
2133 }
2134 return Status::OK();
2135}
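// Worked example of the spacing math above (illustrative): a file with 1024
// data blocks and kMaxNumAnchors = 128 gives num_blocks_per_anchor =
// 1024 / 128 = 8, so every 8th index entry emits an anchor and exactly 128
// anchors are produced; each anchor's range_size covers the file bytes
// spanned since the previous anchor. With fewer than 128 data blocks,
// num_blocks_per_anchor is clamped to 1 and every data block gets an anchor.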
2136
2137Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2138 GetContext* get_context,
2139 const SliceTransform* prefix_extractor,
2140 bool skip_filters) {
2141 assert(key.size() >= 8); // key must be internal key
2142 assert(get_context != nullptr);
2143 Status s;
2144 const bool no_io = read_options.read_tier == kBlockCacheTier;
2145
2146 FilterBlockReader* const filter =
2147 !skip_filters ? rep_->filter.get() : nullptr;
2148
2149 // First check the full filter.
2150 // If the full filter is not useful, then go into each block.
2151 uint64_t tracing_get_id = get_context->get_tracing_get_id();
2152 BlockCacheLookupContext lookup_context{
2153 TableReaderCaller::kUserGet, tracing_get_id,
2154 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2155 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2156 // Trace the key since it contains both user key and sequence number.
2157 lookup_context.referenced_key = key.ToString();
2158 lookup_context.get_from_user_specified_snapshot =
2159 read_options.snapshot != nullptr;
2160 }
20effc67 2161 TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
2162 const bool may_match = FullFilterKeyMayMatch(
2163 filter, key, no_io, prefix_extractor, get_context, &lookup_context,
2164 read_options.rate_limiter_priority);
20effc67 2165 TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
f67539c2 2166 if (!may_match) {
1e59de90 2167 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL);
2168 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2169 } else {
2170 IndexBlockIter iiter_on_stack;
2171 // if prefix_extractor found in block differs from options, disable
2172 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2173 bool need_upper_bound_check = false;
2174 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
1e59de90 2175 need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
2176 }
2177 auto iiter =
2178 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
2179 get_context, &lookup_context);
2180 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2181 if (iiter != &iiter_on_stack) {
2182 iiter_unique_ptr.reset(iiter);
2183 }
2184
2185 size_t ts_sz =
2186 rep_->internal_comparator.user_comparator()->timestamp_size();
20effc67 2187 bool matched = false; // if such user key matched a key in SST
2188 bool done = false;
2189 for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
2190 IndexValue v = iiter->value();
2191
2192 if (!v.first_internal_key.empty() && !skip_filters &&
2193 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2194 .CompareWithoutTimestamp(
2195 ExtractUserKey(key),
2196 ExtractUserKey(v.first_internal_key)) < 0) {
2197 // The requested key falls between highest key in previous block and
2198 // lowest key in current block.
2199 break;
2200 }
2201
2202 BlockCacheLookupContext lookup_data_block_context{
2203 TableReaderCaller::kUserGet, tracing_get_id,
2204 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
2205 nullptr};
2206 bool does_referenced_key_exist = false;
2207 DataBlockIter biter;
2208 uint64_t referenced_data_size = 0;
1e59de90 2209 Status tmp_status;
2210 NewDataBlockIterator<DataBlockIter>(
2211 read_options, v.handle, &biter, BlockType::kData, get_context,
2212 &lookup_data_block_context, /*prefetch_buffer=*/nullptr,
2213 /*for_compaction=*/false, /*async_read=*/false, tmp_status);
2214
2215 if (no_io && biter.status().IsIncomplete()) {
2216 // Couldn't get the block from the block cache.
2217 // Update Saver.state to Found because we are only looking for whether
2218 // we can guarantee the key is not there when "no_io" is set.
2219 get_context->MarkKeyMayExist();
1e59de90 2220 s = biter.status();
2221 break;
2222 }
2223 if (!biter.status().ok()) {
2224 s = biter.status();
2225 break;
2226 }
2227
2228 bool may_exist = biter.SeekForGet(key);
2229 // If user-specified timestamp is supported, we cannot end the search
2230 // just because hash index lookup indicates the key+ts does not exist.
2231 if (!may_exist && ts_sz == 0) {
2232 // HashSeek cannot find the key in this block, and the iter is not at
2233 // the end of the block, i.e. the key cannot be in the following blocks
2234 // either. In this case, the seek_key cannot be found, so we break
2235 // from the top level for-loop.
2236 done = true;
2237 } else {
2238 // Call the *saver function on each entry/block until it returns false
2239 for (; biter.Valid(); biter.Next()) {
2240 ParsedInternalKey parsed_key;
2241 Status pik_status = ParseInternalKey(
2242 biter.key(), &parsed_key, false /* log_err_key */); // TODO
2243 if (!pik_status.ok()) {
2244 s = pik_status;
2245 }
2246
2247 if (!get_context->SaveValue(
2248 parsed_key, biter.value(), &matched,
2249 biter.IsValuePinned() ? &biter : nullptr)) {
2250 if (get_context->State() == GetContext::GetState::kFound) {
2251 does_referenced_key_exist = true;
2252 referenced_data_size = biter.key().size() + biter.value().size();
2253 }
2254 done = true;
2255 break;
2256 }
2257 }
2258 s = biter.status();
2259 }
2260 // Write the block cache access record.
2261 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2262 // Avoid making copies of block_key, cf_name, and referenced_key when
2263 // constructing the access record.
2264 Slice referenced_key;
2265 if (does_referenced_key_exist) {
2266 referenced_key = biter.key();
2267 } else {
2268 referenced_key = key;
2269 }
2270 BlockCacheTraceRecord access_record(
1e59de90 2271 rep_->ioptions.clock->NowMicros(),
2272 /*block_key=*/"", lookup_data_block_context.block_type,
2273 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
2274 /*cf_name=*/"", rep_->level_for_tracing(),
2275 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
2276 lookup_data_block_context.is_cache_hit,
2277 lookup_data_block_context.no_insert,
2278 lookup_data_block_context.get_id,
2279 lookup_data_block_context.get_from_user_specified_snapshot,
2280 /*referenced_key=*/"", referenced_data_size,
2281 lookup_data_block_context.num_keys_in_block,
2282 does_referenced_key_exist);
2283 // TODO: Should handle status here?
2284 block_cache_tracer_
2285 ->WriteBlockAccess(access_record,
2286 lookup_data_block_context.block_key,
2287 rep_->cf_name_for_tracing(), referenced_key)
2288 .PermitUncheckedError();
2289 }
2290
2291 if (done) {
2292 // Avoid the extra Next which is expensive in two-level indexes
2293 break;
2294 }
2295 }
2296 if (matched && filter != nullptr) {
2297 RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2298 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
2299 rep_->level);
2300 }
2301 if (s.ok() && !iiter->status().IsNotFound()) {
2302 s = iiter->status();
2303 }
2304 }
2305
2306 return s;
2307}
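// Example (illustrative sketch, not part of the original source): observing
// the filter-effectiveness tickers recorded in Get() above via the public
// statistics API. Assumes #include <rocksdb/db.h> and an open DB.
//
//   rocksdb::Options options;
//   options.statistics = rocksdb::CreateDBStatistics();
//   // ... open the DB with `options` and issue some reads ...
//   uint64_t useful =
//       options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_USEFUL);
//   uint64_t positives = options.statistics->getTickerCount(
//       rocksdb::BLOOM_FILTER_FULL_POSITIVE);
//   uint64_t true_positives = options.statistics->getTickerCount(
//       rocksdb::BLOOM_FILTER_FULL_TRUE_POSITIVE);
//   // False-positive share among positives ~=
//   //     (positives - true_positives) / positives.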
2308
2309Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options,
2310 const SliceTransform* prefix_extractor,
2311 MultiGetRange* mget_range) {
2312 if (mget_range->empty()) {
2313 // Caller should ensure non-empty (performance bug)
2314 assert(false);
1e59de90 2315 return Status::OK(); // Nothing to do
2316 }
2317
2318 FilterBlockReader* const filter = rep_->filter.get();
2319 if (!filter) {
2320 return Status::OK();
2321 }
2322
2323 // First check the full filter.
2324 // If the full filter is not useful, then go into each block.
2325 const bool no_io = read_options.read_tier == kBlockCacheTier;
2326 uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
2327 if (mget_range->begin()->get_context) {
2328 tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id();
2329 }
2330 BlockCacheLookupContext lookup_context{
2331 TableReaderCaller::kUserMultiGet, tracing_mget_id,
2332 /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2333 FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor,
2334 &lookup_context, read_options.rate_limiter_priority);
f67539c2 2335
1e59de90 2336 return Status::OK();
2337}
2338
2339Status BlockBasedTable::Prefetch(const Slice* const begin,
2340 const Slice* const end) {
2341 auto& comparator = rep_->internal_comparator;
2342 UserComparatorWrapper user_comparator(comparator.user_comparator());
2343 // pre-condition
2344 if (begin && end && comparator.Compare(*begin, *end) > 0) {
2345 return Status::InvalidArgument(*begin, *end);
2346 }
2347 BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
2348 IndexBlockIter iiter_on_stack;
2349 auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2350 &iiter_on_stack, /*get_context=*/nullptr,
2351 &lookup_context);
2352 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2353 if (iiter != &iiter_on_stack) {
2354 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2355 }
2356
2357 if (!iiter->status().ok()) {
2358 // error opening index iterator
2359 return iiter->status();
2360 }
2361
2362 // Indicates whether we are on the last page that needs to be prefetched.
2363 bool prefetching_boundary_page = false;
2364
2365 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
2366 iiter->Next()) {
2367 BlockHandle block_handle = iiter->value().handle;
2368 const bool is_user_key = !rep_->index_key_includes_seq;
2369 if (end &&
2370 ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
2371 (is_user_key &&
2372 user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
2373 if (prefetching_boundary_page) {
2374 break;
2375 }
2376
2377 // The index entry represents the last key in the data block.
2378 // We should load this page into memory as well, but no more after it.
2379 prefetching_boundary_page = true;
2380 }
2381
2382 // Load the block specified by the block_handle into the block cache
2383 DataBlockIter biter;
1e59de90 2384 Status tmp_status;
2385 NewDataBlockIterator<DataBlockIter>(
2386 ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
2387 /*get_context=*/nullptr, &lookup_context,
2388 /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
2389 /*async_read=*/false, tmp_status);
2390
2391 if (!biter.status().ok()) {
2392 // there was an unexpected error while pre-fetching
2393 return biter.status();
2394 }
2395 }
2396
2397 return Status::OK();
2398}
2399
2400Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
2401 TableReaderCaller caller) {
2402 Status s;
2403 // Check Meta blocks
2404 std::unique_ptr<Block> metaindex;
2405 std::unique_ptr<InternalIterator> metaindex_iter;
2406 ReadOptions ro;
2407 s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex,
2408 &metaindex_iter);
2409 if (s.ok()) {
2410 s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
2411 if (!s.ok()) {
2412 return s;
2413 }
2414 } else {
2415 return s;
2416 }
2417 // Check Data blocks
2418 IndexBlockIter iiter_on_stack;
2419 BlockCacheLookupContext context{caller};
2420 InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
2421 read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
2422 /*get_context=*/nullptr, &context);
2423 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2424 if (iiter != &iiter_on_stack) {
2425 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2426 }
2427 if (!iiter->status().ok()) {
2428 // error opening index iterator
2429 return iiter->status();
2430 }
2431 s = VerifyChecksumInBlocks(read_options, iiter);
2432 return s;
2433}
2434
2435Status BlockBasedTable::VerifyChecksumInBlocks(
2436 const ReadOptions& read_options,
2437 InternalIteratorBase<IndexValue>* index_iter) {
2438 Status s;
2439 // We are scanning the whole file, so there is no need to grow the
2440 // buffer size exponentially.
2441 size_t readahead_size = (read_options.readahead_size != 0)
2442 ? read_options.readahead_size
1e59de90 2443 : rep_->table_options.max_auto_readahead_size;
2444 // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
2445 // needed there.
2446 FilePrefetchBuffer prefetch_buffer(
1e59de90 2447 readahead_size /* readahead_size */,
2448 readahead_size /* max_readahead_size */,
2449 !rep_->ioptions.allow_mmap_reads /* enable */);
2450
2451 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2452 s = index_iter->status();
2453 if (!s.ok()) {
2454 break;
2455 }
2456 BlockHandle handle = index_iter->value().handle;
2457 BlockContents contents;
2458 BlockFetcher block_fetcher(
1e59de90 2459 rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle,
2460 &contents, rep_->ioptions, false /* decompress */,
2461 false /*maybe_compressed*/, BlockType::kData,
2462 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
2463 s = block_fetcher.ReadBlockContents();
2464 if (!s.ok()) {
2465 break;
2466 }
2467 }
2468 if (s.ok()) {
2469 // In the case of two level indexes, we would have exited the above loop
2470 // by checking index_iter->Valid(), but Valid() might have returned false
2471 // due to an IO error. So check the index_iter status
2472 s = index_iter->status();
2473 }
2474 return s;
2475}
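// Example (illustrative sketch, not part of the original source): the public
// entry point that drives the per-block verification loop above. A larger
// ReadOptions::readahead_size can speed up this sequential whole-file scan.
// Assumes #include <rocksdb/db.h>.
//
//   rocksdb::ReadOptions ro;
//   ro.readahead_size = 4 << 20;  // fixed 4 MiB readahead for the scan
//   rocksdb::Status s = db->VerifyChecksum(ro);
//   if (!s.ok()) {
//     // A block failed its checksum or an I/O error occurred.
//   }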
2476
2477BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
2478 const Slice& meta_block_name) {
1e59de90 2479 if (meta_block_name.starts_with(kFullFilterBlockPrefix)) {
2480 return BlockType::kFilter;
2481 }
2482
2483 if (meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
2484 return BlockType::kFilterPartitionIndex;
2485 }
2486
2487 if (meta_block_name == kPropertiesBlockName) {
2488 return BlockType::kProperties;
2489 }
2490
1e59de90 2491 if (meta_block_name == kCompressionDictBlockName) {
2492 return BlockType::kCompressionDictionary;
2493 }
2494
1e59de90 2495 if (meta_block_name == kRangeDelBlockName) {
2496 return BlockType::kRangeDeletion;
2497 }
2498
2499 if (meta_block_name == kHashIndexPrefixesBlock) {
2500 return BlockType::kHashIndexPrefixes;
2501 }
2502
2503 if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
2504 return BlockType::kHashIndexMetadata;
2505 }
2506
2507 if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) {
2508 // Obsolete but possible in old files
2509 return BlockType::kInvalid;
2510 }
2511
2512 assert(false);
2513 return BlockType::kInvalid;
2514}
2515
2516Status BlockBasedTable::VerifyChecksumInMetaBlocks(
2517 InternalIteratorBase<Slice>* index_iter) {
2518 Status s;
2519 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2520 s = index_iter->status();
2521 if (!s.ok()) {
2522 break;
2523 }
2524 BlockHandle handle;
2525 Slice input = index_iter->value();
2526 s = handle.DecodeFrom(&input);
2527 BlockContents contents;
2528 const Slice meta_block_name = index_iter->key();
2529 if (meta_block_name == kPropertiesBlockName) {
2530 // Unfortunate special handling for properties block checksum w/
2531 // global seqno
2532 std::unique_ptr<TableProperties> table_properties;
2533 s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(),
2534 nullptr /* prefetch_buffer */, rep_->footer,
2535 rep_->ioptions, &table_properties,
2536 nullptr /* memory_allocator */);
2537 } else {
2538 s = BlockFetcher(
2539 rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
2540 ReadOptions(), handle, &contents, rep_->ioptions,
2541 false /* decompress */, false /*maybe_compressed*/,
2542 GetBlockTypeForMetaBlockByName(meta_block_name),
2543 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options)
2544 .ReadBlockContents();
2545 }
2546 if (!s.ok()) {
2547 break;
2548 }
2549 }
2550 return s;
2551}
2552
2553bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
2554 assert(rep_ != nullptr);
2555
2556 Cache* const cache = rep_->table_options.block_cache.get();
2557 if (cache == nullptr) {
2558 return false;
2559 }
2560
1e59de90 2561 CacheKey key = GetCacheKey(rep_->base_cache_key, handle);
f67539c2 2562
1e59de90 2563 Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice());
2564 if (cache_handle == nullptr) {
2565 return false;
2566 }
2567
2568 cache->Release(cache_handle);
2569
2570 return true;
2571}
2572
2573bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
2574 const Slice& key) {
2575 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2576 options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2577 /*get_context=*/nullptr, /*lookup_context=*/nullptr));
2578 iiter->Seek(key);
2579 assert(iiter->Valid());
2580
2581 return TEST_BlockInCache(iiter->value().handle);
2582}
2583
2584// REQUIRES: The following fields of rep_ should have already been populated:
2585// 1. file
2586// 2. index_handle,
2587// 3. options
2588// 4. internal_comparator
2589// 5. index_type
2590Status BlockBasedTable::CreateIndexReader(
20effc67 2591 const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
2592 InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin,
2593 BlockCacheLookupContext* lookup_context,
f67539c2 2594 std::unique_ptr<IndexReader>* index_reader) {
2595 switch (rep_->index_type) {
2596 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
20effc67 2597 return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache,
2598 prefetch, pin, lookup_context,
2599 index_reader);
2600 }
2601 case BlockBasedTableOptions::kBinarySearch:
2602 FALLTHROUGH_INTENDED;
2603 case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
2604 return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
2605 use_cache, prefetch, pin,
2606 lookup_context, index_reader);
2607 }
2608 case BlockBasedTableOptions::kHashSearch: {
2609 if (!rep_->table_prefix_extractor) {
2610 ROCKS_LOG_WARN(rep_->ioptions.logger,
2611 "Missing prefix extractor for hash index. Fall back to"
2612 " binary search index.");
2613 return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
2614 use_cache, prefetch, pin,
2615 lookup_context, index_reader);
f67539c2 2616 } else {
2617 return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter,
2618 use_cache, prefetch, pin, lookup_context,
2619 index_reader);
2620 }
2621 }
2622 default: {
2623 std::string error_message =
1e59de90 2624 "Unrecognized index type: " + std::to_string(rep_->index_type);
2625 return Status::InvalidArgument(error_message.c_str());
2626 }
2627 }
2628}
2629
2630uint64_t BlockBasedTable::ApproximateDataOffsetOf(
2631 const InternalIteratorBase<IndexValue>& index_iter,
2632 uint64_t data_size) const {
1e59de90 2633 assert(index_iter.status().ok());
2634 if (index_iter.Valid()) {
2635 BlockHandle handle = index_iter.value().handle;
20effc67 2636 return handle.offset();
f67539c2 2637 } else {
2638 // The iterator is past the last key in the file.
2639 return data_size;
f67539c2 2640 }
20effc67 2641}
f67539c2 2642
2643uint64_t BlockBasedTable::GetApproximateDataSize() {
2644 // Should be in table properties unless super old version
2645 if (rep_->table_properties) {
2646 return rep_->table_properties->data_size;
2647 }
2648 // Fall back to rough estimate from footer
2649 return rep_->footer.metaindex_handle().offset();
2650}
2651
2652uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
2653 TableReaderCaller caller) {
2654 uint64_t data_size = GetApproximateDataSize();
2655 if (UNLIKELY(data_size == 0)) {
2656 // Hmm. Let's just split in half to avoid skewing one way or another,
2657 // since we don't know whether we're operating on lower bound or
2658 // upper bound.
2659 return rep_->file_size / 2;
2660 }
2661
2662 BlockCacheLookupContext context(caller);
2663 IndexBlockIter iiter_on_stack;
2664 ReadOptions ro;
2665 ro.total_order_seek = true;
2666 auto index_iter =
2667 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2668 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2669 /*lookup_context=*/&context);
2670 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2671 if (index_iter != &iiter_on_stack) {
2672 iiter_unique_ptr.reset(index_iter);
2673 }
2674
2675 index_iter->Seek(key);
2676 uint64_t offset;
2677 if (index_iter->status().ok()) {
2678 offset = ApproximateDataOffsetOf(*index_iter, data_size);
2679 } else {
2680 // Split in half to avoid skewing one way or another,
2681 // since we don't know whether we're operating on lower bound or
2682 // upper bound.
2683 return rep_->file_size / 2;
2684 }
20effc67 2685
2686 // Pro-rate file metadata (incl filters) size-proportionally across data
2687 // blocks.
2688 double size_ratio =
2689 static_cast<double>(offset) / static_cast<double>(data_size);
2690 return static_cast<uint64_t>(size_ratio *
2691 static_cast<double>(rep_->file_size));
2692}
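// Worked example of the pro-rating above (illustrative): if the seek lands at
// offset 40 MB within data_size = 80 MB, then size_ratio = 0.5; for a 100 MB
// file (data + index + filter + metadata) the returned estimate is 50 MB, so
// the ~20 MB of non-data bytes are spread proportionally across the keyspace.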
2693
2694uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
2695 TableReaderCaller caller) {
2696 assert(rep_->internal_comparator.Compare(start, end) <= 0);
2697
2698 uint64_t data_size = GetApproximateDataSize();
2699 if (UNLIKELY(data_size == 0)) {
2700 // Hmm. Assume whole file is involved, since we have lower and upper
2701 // bound. This likely skews the estimate if we consider that this function
2702 // is typically called with `[start, end]` fully contained in the file's
2703 // key-range.
2704 return rep_->file_size;
2705 }
2706
2707 BlockCacheLookupContext context(caller);
2708 IndexBlockIter iiter_on_stack;
2709 ReadOptions ro;
2710 ro.total_order_seek = true;
2711 auto index_iter =
2712 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2713 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2714 /*lookup_context=*/&context);
2715 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2716 if (index_iter != &iiter_on_stack) {
2717 iiter_unique_ptr.reset(index_iter);
2718 }
2719
2720 index_iter->Seek(start);
2721 uint64_t start_offset;
2722 if (index_iter->status().ok()) {
2723 start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
2724 } else {
2725 // Assume file is involved from the start. This likely skews the estimate
2726 // but is consistent with the above error handling.
2727 start_offset = 0;
2728 }
2729
f67539c2 2730 index_iter->Seek(end);
2731 uint64_t end_offset;
2732 if (index_iter->status().ok()) {
2733 end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
2734 } else {
2735 // Assume file is involved until the end. This likely skews the estimate
2736 // but is consistent with the above error handling.
2737 end_offset = data_size;
2738 }
2739
2740 assert(end_offset >= start_offset);
2741 // Pro-rate file metadata (incl filters) size-proportionally across data
2742 // blocks.
2743 double size_ratio = static_cast<double>(end_offset - start_offset) /
2744 static_cast<double>(data_size);
2745 return static_cast<uint64_t>(size_ratio *
2746 static_cast<double>(rep_->file_size));
2747}
2748
2749bool BlockBasedTable::TEST_FilterBlockInCache() const {
2750 assert(rep_ != nullptr);
2751 return rep_->filter_type != Rep::FilterType::kNoFilter &&
2752 TEST_BlockInCache(rep_->filter_handle);
2753}
2754
2755bool BlockBasedTable::TEST_IndexBlockInCache() const {
2756 assert(rep_ != nullptr);
2757
2758 return TEST_BlockInCache(rep_->footer.index_handle());
2759}
2760
2761Status BlockBasedTable::GetKVPairsFromDataBlocks(
2762 std::vector<KVPairBlock>* kv_pair_blocks) {
2763 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
2764 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2765 /*input_iter=*/nullptr, /*get_context=*/nullptr,
2766 /*lookup_contex=*/nullptr));
2767
2768 Status s = blockhandles_iter->status();
2769 if (!s.ok()) {
2770 // Cannot read Index Block
2771 return s;
2772 }
2773
2774 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
2775 blockhandles_iter->Next()) {
2776 s = blockhandles_iter->status();
2777
2778 if (!s.ok()) {
2779 break;
2780 }
2781
2782 std::unique_ptr<InternalIterator> datablock_iter;
1e59de90 2783 Status tmp_status;
2784 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
2785 ReadOptions(), blockhandles_iter->value().handle,
2786 /*input_iter=*/nullptr, /*type=*/BlockType::kData,
2787 /*get_context=*/nullptr, /*lookup_context=*/nullptr,
2788 /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
2789 /*async_read=*/false, tmp_status));
2790 s = datablock_iter->status();
2791
2792 if (!s.ok()) {
2793 // Error reading the block - Skipped
2794 continue;
2795 }
2796
2797 KVPairBlock kv_pair_block;
2798 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
2799 datablock_iter->Next()) {
2800 s = datablock_iter->status();
2801 if (!s.ok()) {
2802 // Error reading the block - Skipped
2803 break;
2804 }
2805 const Slice& key = datablock_iter->key();
2806 const Slice& value = datablock_iter->value();
2807 std::string key_copy = std::string(key.data(), key.size());
2808 std::string value_copy = std::string(value.data(), value.size());
2809
2810 kv_pair_block.push_back(
2811 std::make_pair(std::move(key_copy), std::move(value_copy)));
2812 }
2813 kv_pair_blocks->push_back(std::move(kv_pair_block));
2814 }
2815 return Status::OK();
2816}
2817
Status BlockBasedTable::DumpTable(WritableFile* out_file) {
  WritableFileStringStreamAdapter out_file_wrapper(out_file);
  std::ostream out_stream(&out_file_wrapper);
  // Output Footer
  out_stream << "Footer Details:\n"
                "--------------------------------------\n";
  out_stream << "  " << rep_->footer.ToString() << "\n";

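  // The metaindex maps block names to their handles. The loop below
  // pretty-prints the entries it recognizes; filter blocks are matched by
  // the "filter.rocksdb." substring in their metaindex key.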
  // Output MetaIndex
  out_stream << "Metaindex Details:\n"
                "--------------------------------------\n";
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  ReadOptions ro;
  Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex,
                                &metaindex_iter);
  if (s.ok()) {
    for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
         metaindex_iter->Next()) {
      s = metaindex_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (metaindex_iter->key() == kPropertiesBlockName) {
        out_stream << "  Properties block handle: "
                   << metaindex_iter->value().ToString(true) << "\n";
      } else if (metaindex_iter->key() == kCompressionDictBlockName) {
        out_stream << "  Compression dictionary block handle: "
                   << metaindex_iter->value().ToString(true) << "\n";
      } else if (strstr(metaindex_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_stream << "  Filter block handle: "
                   << metaindex_iter->value().ToString(true) << "\n";
      } else if (metaindex_iter->key() == kRangeDelBlockName) {
        out_stream << "  Range deletion block handle: "
                   << metaindex_iter->value().ToString(true) << "\n";
      }
    }
    out_stream << "\n";
  } else {
    return s;
  }

  // Output TableProperties
  const ROCKSDB_NAMESPACE::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_stream << "Table Properties:\n"
                  "--------------------------------------\n";
    out_stream << "  " << table_properties->ToString("\n  ", ": ") << "\n";
  }

  if (rep_->filter) {
    out_stream << "Filter Details:\n"
                  "--------------------------------------\n";
    out_stream << "  " << rep_->filter->ToString() << "\n";
  }

  // Output Index block
  s = DumpIndexBlock(out_stream);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (rep_->uncompression_dict_reader) {
    CachableEntry<UncompressionDict> uncompression_dict;
    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
        nullptr /* prefetch_buffer */, false /* no_io */,
        false /* verify_checksums */,
        nullptr /* get_context */, nullptr /* lookup_context */,
        &uncompression_dict);
    if (!s.ok()) {
      return s;
    }

    assert(uncompression_dict.GetValue());

    const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
    out_stream << "Compression Dictionary:\n"
                  "--------------------------------------\n";
    out_stream << "  size (bytes): " << raw_dict.size() << "\n\n";
    out_stream << "  HEX    " << raw_dict.ToString(true) << "\n\n";
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_stream << "Range deletions:\n"
                    "--------------------------------------\n";
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(),
                     out_stream);
      }
      out_stream << "\n";
    }
    delete range_del_iter;
  }
  // Output Data blocks
  s = DumpDataBlocks(out_stream);

  if (!s.ok()) {
    return s;
  }

  if (!out_stream.good()) {
    return Status::IOError("Failed to write to output file");
  }
  return Status::OK();
}

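// Dumps the index block: one entry per data block, showing the (user) key in
// hex, the data block handle with its offset and size, followed by a
// space-separated ASCII rendering of the key.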
Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
  out_stream << "Index Details:\n"
                "--------------------------------------\n";
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_stream << "Cannot read Index Block\n\n";
    return s;
  }

  out_stream << "  Block key hex dump: Data block handle\n";
  out_stream << "  Block key ascii\n\n";
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
    if (!rep_->index_key_includes_seq) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_stream << "  HEX    " << user_key.ToString(true) << ": "
               << blockhandles_iter->value().ToString(true,
                                                      rep_->index_has_first_key)
               << " offset " << blockhandles_iter->value().handle.offset()
               << " size " << blockhandles_iter->value().handle.size() << "\n";

    std::string str_key = user_key.ToString();
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_stream << "  ASCII  " << res_key << "\n";
    out_stream << "  ------\n";
  }
  out_stream << "\n";
  return Status::OK();
}

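// Dumps every data block, preceded by its handle, and finishes with summary
// statistics (count, min/max/avg block size). Per-block read errors are
// reported inline and skipped.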
Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_stream << "Cannot read Index Block\n\n";
    return s;
  }

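  // Running statistics over the data block sizes recorded in the index
  // handles (the on-disk, possibly compressed sizes, excluding the block
  // trailer).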
  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value().handle;
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_stream << "Data Block # " << block_id << " @ "
               << blockhandles_iter->value().handle.ToString(true) << "\n";
    out_stream << "--------------------------------------\n";

    std::unique_ptr<InternalIterator> datablock_iter;
    Status tmp_status;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value().handle,
        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr,
        /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
        /*async_read=*/false, tmp_status));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_stream << "Error reading the block - Skipped\n\n";
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_stream << "Error reading the block - Skipped\n";
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream);
    }
    out_stream << "\n";
  }

  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_stream << "Data Block Summary:\n";
    out_stream << "--------------------------------------\n";
    out_stream << "  # data blocks: " << num_datablocks << "\n";
    out_stream << "  min data block size: " << datablock_size_min << "\n";
    out_stream << "  max data block size: " << datablock_size_max << "\n";
    out_stream << "  avg data block size: "
               << std::to_string(datablock_size_avg) << "\n";
  }

  return Status::OK();
}

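// Prints a single key-value pair, first as hex, then as a space-separated
// ASCII rendering in which embedded NUL bytes are escaped as "\0". The key is
// decoded as an InternalKey and only its user key is shown.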
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   std::ostream& out_stream) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_stream << "  HEX    " << ikey.user_key().ToString(true) << ": "
             << value.ToString(true) << "\n";

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_stream << "  ASCII  " << res_key << ": " << res_value << "\n";
  out_stream << "  ------\n";
}

}  // namespace ROCKSDB_NAMESPACE