// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
+#include "table/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
-static inline const char* DecodeEntry(const char* p, const char* limit,
- uint32_t* shared,
- uint32_t* non_shared,
- uint32_t* value_length) {
- if (limit - p < 3) return nullptr;
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
- *value_length = reinterpret_cast<const unsigned char*>(p)[2];
- if ((*shared | *non_shared | *value_length) < 128) {
- // Fast path: all three values are encoded in one byte each
- p += 3;
- } else {
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
- if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
+struct DecodeEntry {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared,
+ uint32_t* value_length) {
+ // We need 2 bytes for shared and non_shared size. We also need one more
+ // byte either for value size or the actual value in case of value delta
+ // encoding.
+ assert(limit - p >= 3);
+ *shared = reinterpret_cast<const unsigned char*>(p)[0];
+ *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+ *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+ if ((*shared | *non_shared | *value_length) < 128) {
+ // Fast path: all three values are encoded in one byte each
+ p += 3;
+ } else {
+ if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+ return nullptr;
+ }
+ }
+
+ // Using an assert in place of "return null" since we should not pay the
+ // cost of checking for corruption on every single key decoding
+ assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
+ return p;
}
+};
- if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
- return nullptr;
+struct DecodeKey {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared) {
+ uint32_t value_length;
+ return DecodeEntry()(p, limit, shared, non_shared, &value_length);
}
- return p;
+};
+
+// In format_version 4, which is used by index blocks, the value size is not
+// encoded before the entry, as the value is known to be the handle with the
+// known size.
+struct DecodeKeyV4 {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared) {
+ // We need 2 bytes for shared and non_shared size. We also need one more
+ // byte either for value size or the actual value in case of value delta
+ // encoding.
+ if (limit - p < 3) return nullptr;
+ *shared = reinterpret_cast<const unsigned char*>(p)[0];
+ *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+ if ((*shared | *non_shared) < 128) {
+ // Fast path: all three values are encoded in one byte each
+ p += 2;
+ } else {
+ if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+ }
+ return p;
+ }
+};
+
+void DataBlockIter::Next() {
+ assert(Valid());
+ ParseNextDataKey();
}
-void BlockIter::Next() {
+void IndexBlockIter::Next() {
assert(Valid());
- ParseNextKey();
+ ParseNextIndexKey();
}
-void BlockIter::Prev() {
+void IndexBlockIter::Prev() {
+ assert(Valid());
+ // Scan backwards to a restart point before current_
+ const uint32_t original = current_;
+ while (GetRestartPoint(restart_index_) >= original) {
+ if (restart_index_ == 0) {
+ // No more entries
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return;
+ }
+ restart_index_--;
+ }
+ SeekToRestartPoint(restart_index_);
+ do {
+ if (!ParseNextIndexKey()) {
+ break;
+ }
+ // Loop until end of current entry hits the start of original entry
+ } while (NextEntryOffset() < original);
+}
+
+// Similar to IndexBlockIter::Prev but also caches the prev entries
+void DataBlockIter::Prev() {
assert(Valid());
assert(prev_entries_idx_ == -1 ||
const Slice current_key(key_ptr, current_prev_entry.key_size);
current_ = current_prev_entry.offset;
- key_.SetInternalKey(current_key, false /* copy */);
+ key_.SetKey(current_key, false /* copy */);
value_ = current_prev_entry.value;
return;
SeekToRestartPoint(restart_index_);
do {
- if (!ParseNextKey()) {
+ if (!ParseNextDataKey()) {
break;
}
Slice current_key = key();
prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
-void BlockIter::Seek(const Slice& target) {
+void DataBlockIter::Seek(const Slice& target) {
+ Slice seek_key = target;
+ PERF_TIMER_GUARD(block_seek_nanos);
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ uint32_t index = 0;
+ bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
+
+ if (!ok) {
+ return;
+ }
+ SeekToRestartPoint(index);
+ // Linear search (within restart block) for first key >= target
+
+ while (true) {
+ if (!ParseNextDataKey() || Compare(key_, seek_key) >= 0) {
+ return;
+ }
+ }
+}
+
+// Optimized Seek for point lookup for an internal key `target`
+// target = "seek_user_key @ type | seqno".
+//
+// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex, this function behaves identically as Seek().
+//
+// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex:
+//
+// If the return value is FALSE, iter location is undefined, and it means:
+// 1) there is no key in this block falling into the range:
+// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
+// inclusive; AND
+// 2) the last key of this block has a greater user_key from seek_user_key
+//
+// If the return value is TRUE, iter location has two possibilies:
+// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
+// this case, it points to the first key_ with a larger user_key or a
+// matching user_key with a seqno no greater than the seeking seqno.
+// 2) If the iter is invalid, it means that either all the user_key is less
+// than the seek_user_key, or the block ends with a matching user_key but
+// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
+// but larger type).
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+ Slice user_key = ExtractUserKey(target);
+ uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+ uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key);
+
+ if (entry == kCollision) {
+ // HashSeek not effective, falling back
+ Seek(target);
+ return true;
+ }
+
+ if (entry == kNoEntry) {
+ // Even if we cannot find the user_key in this block, the result may
+ // exist in the next block. Consider this exmpale:
+ //
+ // Block N: [aab@100, ... , app@120]
+ // bounary key: axy@50 (we make minimal assumption about a boundary key)
+ // Block N+1: [axy@10, ... ]
+ //
+ // If seek_key = axy@60, the search will starts from Block N.
+ // Even if the user_key is not found in the hash map, the caller still
+ // have to conntinue searching the next block.
+ //
+ // In this case, we pretend the key is the the last restart interval.
+ // The while-loop below will search the last restart interval for the
+ // key. It will stop at the first key that is larger than the seek_key,
+ // or to the end of the block if no one is larger.
+ entry = static_cast<uint8_t>(num_restarts_ - 1);
+ }
+
+ uint32_t restart_index = entry;
+
+ // check if the key is in the restart_interval
+ assert(restart_index < num_restarts_);
+ SeekToRestartPoint(restart_index);
+
+ const char* limit = nullptr;
+ if (restart_index_ + 1 < num_restarts_) {
+ limit = data_ + GetRestartPoint(restart_index_ + 1);
+ } else {
+ limit = data_ + restarts_;
+ }
+
+ while (true) {
+ // Here we only linear seek the target key inside the restart interval.
+ // If a key does not exist inside a restart interval, we avoid
+ // further searching the block content accross restart interval boundary.
+ //
+ // TODO(fwu): check the left and write boundary of the restart interval
+ // to avoid linear seek a target key that is out of range.
+ if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) {
+ // we stop at the first potential matching user key.
+ break;
+ }
+ }
+
+ if (current_ == restarts_) {
+ // Search reaches to the end of the block. There are three possibilites:
+ // 1) there is only one user_key match in the block (otherwise collsion).
+ // the matching user_key resides in the last restart interval, and it
+ // is the last key of the restart interval and of the block as well.
+ // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller.
+ //
+ // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+ // AND all existing user_keys in the restart interval are smaller than
+ // seek_user_key.
+ //
+ // 3) The seek_key is a false positive and happens to be hashed to the
+ // last restart interval, AND all existing user_keys in the restart
+ // interval are smaller than seek_user_key.
+ //
+ // The result may exist in the next block each case, so we return true.
+ return true;
+ }
+
+ if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) {
+ // the key is not in this block and cannot be at the next block either.
+ return false;
+ }
+
+ // Here we are conservative and only support a limited set of cases
+ ValueType value_type = ExtractValueType(key_.GetKey());
+ if (value_type != ValueType::kTypeValue &&
+ value_type != ValueType::kTypeDeletion &&
+ value_type != ValueType::kTypeSingleDeletion &&
+ value_type != ValueType::kTypeBlobIndex) {
+ Seek(target);
+ return true;
+ }
+
+ // Result found, and the iter is correctly set.
+ return true;
+}
+
+void IndexBlockIter::Seek(const Slice& target) {
+ Slice seek_key = target;
+ if (!key_includes_seq_) {
+ seek_key = ExtractUserKey(target);
+ }
PERF_TIMER_GUARD(block_seek_nanos);
if (data_ == nullptr) { // Not init yet
return;
bool ok = false;
if (prefix_index_) {
ok = PrefixSeek(target, &index);
+ } else if (value_delta_encoded_) {
+ ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
} else {
- ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
+ ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
}
if (!ok) {
// Linear search (within restart block) for first key >= target
while (true) {
- if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
+ if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) {
return;
}
}
}
-void BlockIter::SeekForPrev(const Slice& target) {
+void DataBlockIter::SeekForPrev(const Slice& target) {
PERF_TIMER_GUARD(block_seek_nanos);
+ Slice seek_key = target;
if (data_ == nullptr) { // Not init yet
return;
}
uint32_t index = 0;
- bool ok = false;
- ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
+ bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
if (!ok) {
return;
}
SeekToRestartPoint(index);
- // Linear search (within restart block) for first key >= target
+ // Linear search (within restart block) for first key >= seek_key
- while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
+ while (ParseNextDataKey() && Compare(key_, seek_key) < 0) {
}
if (!Valid()) {
SeekToLast();
} else {
- while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
+ while (Valid() && Compare(key_, seek_key) > 0) {
Prev();
}
}
}
-void BlockIter::SeekToFirst() {
+void DataBlockIter::SeekToFirst() {
if (data_ == nullptr) { // Not init yet
return;
}
SeekToRestartPoint(0);
- ParseNextKey();
+ ParseNextDataKey();
}
-void BlockIter::SeekToLast() {
+void IndexBlockIter::SeekToFirst() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ SeekToRestartPoint(0);
+ ParseNextIndexKey();
+}
+
+void DataBlockIter::SeekToLast() {
if (data_ == nullptr) { // Not init yet
return;
}
SeekToRestartPoint(num_restarts_ - 1);
- while (ParseNextKey() && NextEntryOffset() < restarts_) {
+ while (ParseNextDataKey() && NextEntryOffset() < restarts_) {
// Keep skipping
}
}
-void BlockIter::CorruptionError() {
+void IndexBlockIter::SeekToLast() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ SeekToRestartPoint(num_restarts_ - 1);
+ while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
+ // Keep skipping
+ }
+}
+
+template <class TValue>
+void BlockIter<TValue>::CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
value_.clear();
}
-bool BlockIter::ParseNextKey() {
+bool DataBlockIter::ParseNextDataKey(const char* limit) {
current_ = NextEntryOffset();
const char* p = data_ + current_;
- const char* limit = data_ + restarts_; // Restarts come right after data
+ if (!limit) {
+ limit = data_ + restarts_; // Restarts come right after data
+ }
+
if (p >= limit) {
// No more entries to return. Mark as invalid.
current_ = restarts_;
// Decode next entry
uint32_t shared, non_shared, value_length;
- p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+ p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
if (p == nullptr || key_.Size() < shared) {
CorruptionError();
return false;
if (shared == 0) {
// If this key dont share any bytes with prev key then we dont need
// to decode it and can use it's address in the block directly.
- key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
+ key_.SetKey(Slice(p, non_shared), false /* copy */);
key_pinned_ = true;
} else {
// This key share `shared` bytes with prev key, we need to decode it
if (global_seqno_ != kDisableGlobalSequenceNumber) {
// If we are reading a file with a global sequence number we should
- // expect that all encoded sequence numbers are zeros and all value
- // types are kTypeValue
+ // expect that all encoded sequence numbers are zeros and any value
+ // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);
- assert(ExtractValueType(key_.GetInternalKey()) == ValueType::kTypeValue);
+
+ ValueType value_type = ExtractValueType(key_.GetKey());
+ assert(value_type == ValueType::kTypeValue ||
+ value_type == ValueType::kTypeMerge ||
+ value_type == ValueType::kTypeDeletion ||
+ value_type == ValueType::kTypeRangeDeletion);
if (key_pinned_) {
// TODO(tec): Investigate updating the seqno in the loaded block
key_pinned_ = false;
}
- key_.UpdateInternalKey(global_seqno_, ValueType::kTypeValue);
+ key_.UpdateInternalKey(global_seqno_, value_type);
}
value_ = Slice(p + non_shared, value_length);
+ if (shared == 0) {
+ while (restart_index_ + 1 < num_restarts_ &&
+ GetRestartPoint(restart_index_ + 1) < current_) {
+ ++restart_index_;
+ }
+ }
+ // else we are in the middle of a restart interval and the restart_index_
+ // thus has not changed
+ return true;
+ }
+}
+
+bool IndexBlockIter::ParseNextIndexKey() {
+ current_ = NextEntryOffset();
+ const char* p = data_ + current_;
+ const char* limit = data_ + restarts_; // Restarts come right after data
+ if (p >= limit) {
+ // No more entries to return. Mark as invalid.
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return false;
+ }
+
+ // Decode next entry
+ uint32_t shared, non_shared, value_length;
+ if (value_delta_encoded_) {
+ p = DecodeKeyV4()(p, limit, &shared, &non_shared);
+ value_length = 0;
+ } else {
+ p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
+ }
+ if (p == nullptr || key_.Size() < shared) {
+ CorruptionError();
+ return false;
+ }
+ if (shared == 0) {
+ // If this key dont share any bytes with prev key then we dont need
+ // to decode it and can use it's address in the block directly.
+ key_.SetKey(Slice(p, non_shared), false /* copy */);
+ key_pinned_ = true;
+ } else {
+ // This key share `shared` bytes with prev key, we need to decode it
+ key_.TrimAppend(shared, p, non_shared);
+ key_pinned_ = false;
+ }
+ value_ = Slice(p + non_shared, value_length);
+ if (shared == 0) {
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
- return true;
+ }
+ // else we are in the middle of a restart interval and the restart_index_
+ // thus has not changed
+ if (value_delta_encoded_) {
+ assert(value_length == 0);
+ DecodeCurrentValue(shared);
+ }
+ return true;
+}
+
+// The format:
+// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// ...
+// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// where, k is key, v is value, and its encoding is in parenthesis.
+// The format of each key is (shared_size, non_shared_size, shared, non_shared)
+// The format of each value, i.e., block hanlde, is (offset, size) whenever the
+// shared_size is 0, which included the first entry in each restart point.
+// Otherwise the format is delta-size = block handle size - size of last block
+// handle.
+void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
+ assert(value_delta_encoded_);
+ const char* limit = data_ + restarts_;
+ if (shared == 0) {
+ uint64_t o, s;
+ const char* newp = GetVarint64Ptr(value_.data(), limit, &o);
+ assert(newp);
+ newp = GetVarint64Ptr(newp, limit, &s);
+ assert(newp);
+ decoded_value_ = BlockHandle(o, s);
+ value_ = Slice(value_.data(), newp - value_.data());
+ } else {
+ uint64_t next_value_base =
+ decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize;
+ int64_t delta;
+ const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta);
+ decoded_value_ =
+ BlockHandle(next_value_base, decoded_value_.size() + delta);
+ value_ = Slice(value_.data(), newp - value_.data());
}
}
// is either the last restart point with a key less than target,
// which means the key of next restart point is larger than target, or
// the first restart point with a key = target
-bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
- uint32_t* index) {
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
+ uint32_t right, uint32_t* index,
+ const Comparator* comp) {
assert(left <= right);
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
- uint32_t shared, non_shared, value_length;
- const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
- &shared, &non_shared, &value_length);
+ uint32_t shared, non_shared;
+ const char* key_ptr = DecodeKeyFunc()(
+ data_ + region_offset, data_ + restarts_, &shared, &non_shared);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return false;
}
Slice mid_key(key_ptr, non_shared);
- int cmp = Compare(mid_key, target);
+ int cmp = comp->Compare(mid_key, target);
if (cmp < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
// Compare target key and the block key of the block of `block_index`.
// Return -1 if error.
-int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
uint32_t region_offset = GetRestartPoint(block_index);
- uint32_t shared, non_shared, value_length;
- const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
- &shared, &non_shared, &value_length);
+ uint32_t shared, non_shared;
+ const char* key_ptr =
+ value_delta_encoded_
+ ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
+ &non_shared)
+ : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
+ &non_shared);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return 1; // Return target is smaller
// Binary search in block_ids to find the first block
// with a key >= target
-bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
- uint32_t left, uint32_t right,
- uint32_t* index) {
+bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
+ uint32_t* block_ids, uint32_t left,
+ uint32_t right, uint32_t* index) {
assert(left <= right);
uint32_t left_bound = left;
}
}
-bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
+bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
assert(prefix_index_);
+ Slice seek_key = target;
+ if (!key_includes_seq_) {
+ seek_key = ExtractUserKey(target);
+ }
uint32_t* block_ids = nullptr;
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
current_ = restarts_;
return false;
} else {
- return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
+ return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index);
}
}
uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
- return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+ uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+ uint32_t num_restarts = block_footer;
+ if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+ // In BlockBuilder, we have ensured a block with HashIndex is less than
+ // kMaxBlockSizeSupportedByHashIndex (64KiB).
+ //
+ // Therefore, if we encounter a block with a size > 64KiB, the block
+ // cannot have HashIndex. So the footer will directly interpreted as
+ // num_restarts.
+ //
+ // Such check is for backward compatibility. We can ensure legacy block
+ // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
+ // correctly as no HashIndex even if the MSB of num_restarts is set.
+ return num_restarts;
+ }
+ BlockBasedTableOptions::DataBlockIndexType index_type;
+ UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+ return num_restarts;
+}
+
+BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
+ assert(size_ >= 2 * sizeof(uint32_t));
+ if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+ // The check is for the same reason as that in NumRestarts()
+ return BlockBasedTableOptions::kDataBlockBinarySearch;
+ }
+ uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+ uint32_t num_restarts = block_footer;
+ BlockBasedTableOptions::DataBlockIndexType index_type;
+ UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+ return index_type;
+}
+
+Block::~Block() {
+ // This sync point can be re-enabled if RocksDB can control the
+ // initialization order of any/all static options created by the user.
+ // TEST_SYNC_POINT("Block::~Block");
}
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
: contents_(std::move(contents)),
data_(contents_.data.data()),
size_(contents_.data.size()),
+ restart_offset_(0),
+ num_restarts_(0),
global_seqno_(_global_seqno) {
+ TEST_SYNC_POINT("Block::Block:0");
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
- restart_offset_ =
- static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
- if (restart_offset_ > size_ - sizeof(uint32_t)) {
- // The size is too small for NumRestarts() and therefore
- // restart_offset_ wrapped around.
- size_ = 0;
+ // Should only decode restart points for uncompressed blocks
+ if (compression_type() == kNoCompression) {
+ num_restarts_ = NumRestarts();
+ switch (IndexType()) {
+ case BlockBasedTableOptions::kDataBlockBinarySearch:
+ restart_offset_ = static_cast<uint32_t>(size_) -
+ (1 + num_restarts_) * sizeof(uint32_t);
+ if (restart_offset_ > size_ - sizeof(uint32_t)) {
+ // The size is too small for NumRestarts() and therefore
+ // restart_offset_ wrapped around.
+ size_ = 0;
+ }
+ break;
+ case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+ if (size_ < sizeof(uint32_t) /* block footer */ +
+ sizeof(uint16_t) /* NUM_BUCK */) {
+ size_ = 0;
+ break;
+ }
+
+ uint16_t map_offset;
+ data_block_hash_index_.Initialize(
+ contents.data.data(),
+ static_cast<uint16_t>(contents.data.size() -
+ sizeof(uint32_t)), /*chop off
+ NUM_RESTARTS*/
+ &map_offset);
+
+ restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
+
+ if (restart_offset_ > map_offset) {
+ // map_offset is too small for NumRestarts() and
+ // therefore restart_offset_ wrapped around.
+ size_ = 0;
+ break;
+ }
+ break;
+ default:
+ size_ = 0; // Error marker
+ }
}
}
if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
}
}
-InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
- bool total_order_seek, Statistics* stats) {
- if (size_ < 2*sizeof(uint32_t)) {
- if (iter != nullptr) {
- iter->SetStatus(Status::Corruption("bad block contents"));
- return iter;
- } else {
- return NewErrorInternalIterator(Status::Corruption("bad block contents"));
- }
+template <>
+DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
+ DataBlockIter* iter, Statistics* stats,
+ bool /*total_order_seek*/,
+ bool /*key_includes_seq*/,
+ bool /*value_is_full*/,
+ BlockPrefixIndex* /*prefix_index*/) {
+ DataBlockIter* ret_iter;
+ if (iter != nullptr) {
+ ret_iter = iter;
+ } else {
+ ret_iter = new DataBlockIter;
}
- const uint32_t num_restarts = NumRestarts();
- if (num_restarts == 0) {
- if (iter != nullptr) {
- iter->SetStatus(Status::OK());
- return iter;
- } else {
- return NewEmptyInternalIterator();
- }
+ if (size_ < 2 * sizeof(uint32_t)) {
+ ret_iter->Invalidate(Status::Corruption("bad block contents"));
+ return ret_iter;
+ }
+ if (num_restarts_ == 0) {
+ // Empty block.
+ ret_iter->Invalidate(Status::OK());
+ return ret_iter;
} else {
- BlockPrefixIndex* prefix_index_ptr =
- total_order_seek ? nullptr : prefix_index_.get();
-
- if (iter != nullptr) {
- iter->Initialize(cmp, data_, restart_offset_, num_restarts,
- prefix_index_ptr, global_seqno_, read_amp_bitmap_.get());
- } else {
- iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
- prefix_index_ptr, global_seqno_,
- read_amp_bitmap_.get());
- }
-
+ ret_iter->Initialize(
+ cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
+ read_amp_bitmap_.get(), cachable(),
+ data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) {
// DB changed the Statistics pointer, we need to notify read_amp_bitmap_
}
}
- return iter;
+ return ret_iter;
}
-void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
- prefix_index_.reset(prefix_index);
+template <>
+IndexBlockIter* Block::NewIterator(const Comparator* cmp,
+ const Comparator* ucmp, IndexBlockIter* iter,
+ Statistics* /*stats*/, bool total_order_seek,
+ bool key_includes_seq, bool value_is_full,
+ BlockPrefixIndex* prefix_index) {
+ IndexBlockIter* ret_iter;
+ if (iter != nullptr) {
+ ret_iter = iter;
+ } else {
+ ret_iter = new IndexBlockIter;
+ }
+ if (size_ < 2 * sizeof(uint32_t)) {
+ ret_iter->Invalidate(Status::Corruption("bad block contents"));
+ return ret_iter;
+ }
+ if (num_restarts_ == 0) {
+ // Empty block.
+ ret_iter->Invalidate(Status::OK());
+ return ret_iter;
+ } else {
+ BlockPrefixIndex* prefix_index_ptr =
+ total_order_seek ? nullptr : prefix_index;
+ ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
+ prefix_index_ptr, key_includes_seq, value_is_full,
+ cachable(), nullptr /* data_block_hash_index */);
+ }
+
+ return ret_iter;
}
size_t Block::ApproximateMemoryUsage() const {
size_t usage = usable_size();
- if (prefix_index_) {
- usage += prefix_index_->ApproximateMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size((void*)this);
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ if (read_amp_bitmap_) {
+ usage += read_amp_bitmap_->ApproximateMemoryUsage();
}
return usage;
}