// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
+#include "format.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
#include "table/block_prefix_index.h"
+#include "table/data_block_hash_index.h"
#include "table/internal_iterator.h"
-
-#include "format.h"
+#include "util/random.h"
+#include "util/sync_point.h"
namespace rocksdb {
struct BlockContents;
class Comparator;
+template <class TValue>
class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
class BlockPrefixIndex;
// BlockReadAmpBitmap is a bitmap that map the rocksdb::Block data bytes to
public:
explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
Statistics* statistics)
- : bitmap_(nullptr), bytes_per_bit_pow_(0), statistics_(statistics) {
+ : bitmap_(nullptr),
+ bytes_per_bit_pow_(0),
+ statistics_(statistics),
+ rnd_(
+ Random::GetTLSInstance()->Uniform(static_cast<int>(bytes_per_bit))) {
+ TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
assert(block_size > 0 && bytes_per_bit > 0);
// convert bytes_per_bit to be a power of 2
// num_bits_needed = ceil(block_size / bytes_per_bit)
size_t num_bits_needed =
- (block_size >> static_cast<size_t>(bytes_per_bit_pow_)) +
- (block_size % (static_cast<size_t>(1)
- << static_cast<size_t>(bytes_per_bit_pow_)) !=
- 0);
+ ((block_size - 1) >> bytes_per_bit_pow_) + 1;
+ assert(num_bits_needed > 0);
// bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
- size_t bitmap_size = (num_bits_needed / kBitsPerEntry) +
- (num_bits_needed % kBitsPerEntry != 0);
+ size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
// Create bitmap and set all the bits to 0
- bitmap_ = new std::atomic<uint32_t>[bitmap_size];
- memset(bitmap_, 0, bitmap_size * kBytesPersEntry);
+ bitmap_ = new std::atomic<uint32_t>[bitmap_size]();
- RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES,
- num_bits_needed << bytes_per_bit_pow_);
+ RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
}
~BlockReadAmpBitmap() { delete[] bitmap_; }
void Mark(uint32_t start_offset, uint32_t end_offset) {
assert(end_offset >= start_offset);
-
- // Every new bit we set will bump this counter
- uint32_t new_useful_bytes = 0;
- // Index of first bit in mask (start_offset / bytes_per_bit)
- uint32_t start_bit = start_offset >> bytes_per_bit_pow_;
- // Index of last bit in mask (end_offset / bytes_per_bit)
- uint32_t end_bit = end_offset >> bytes_per_bit_pow_;
- // Index of middle bit (unique to this range)
- uint32_t mid_bit = start_bit + 1;
-
- // It's guaranteed that ranges sent to Mark() wont overlap, this mean that
- // we dont need to set the middle bits, we can simply set only one bit of
- // the middle bits, and check this bit if we want to know if the whole
- // range is set or not.
- if (mid_bit < end_bit) {
- if (GetAndSet(mid_bit) == 0) {
- new_useful_bytes += (end_bit - mid_bit) << bytes_per_bit_pow_;
- } else {
- // If the middle bit is set, it's guaranteed that start and end bits
- // are also set
- return;
- }
- } else {
- // This range dont have a middle bit, the whole range fall in 1 or 2 bits
+ // Index of first bit in mask
+ uint32_t start_bit =
+ (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
+ bytes_per_bit_pow_;
+ // Index of last bit in mask + 1
+ uint32_t exclusive_end_bit =
+ (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
+ if (start_bit >= exclusive_end_bit) {
+ return;
}
+ assert(exclusive_end_bit > 0);
if (GetAndSet(start_bit) == 0) {
- new_useful_bytes += (1 << bytes_per_bit_pow_);
- }
-
- if (GetAndSet(end_bit) == 0) {
- new_useful_bytes += (1 << bytes_per_bit_pow_);
- }
-
- if (new_useful_bytes > 0) {
+ uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
+ << bytes_per_bit_pow_;
RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
new_useful_bytes);
}
uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
+ size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ return malloc_usable_size((void*)this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return sizeof(*this);
+ }
+
private:
// Get the current value of bit at `bit_idx` and set it to 1
inline bool GetAndSet(uint32_t bit_idx) {
// this pointer maybe invalid, but the DB will update it to a valid pointer
// by using SetStatistics() before calling Mark()
std::atomic<Statistics*> statistics_;
+ uint32_t rnd_;
};
class Block {
size_t read_amp_bytes_per_bit = 0,
Statistics* statistics = nullptr);
- ~Block() = default;
+ ~Block();
size_t size() const { return size_; }
const char* data() const { return data_; }
bool cachable() const { return contents_.cachable; }
- size_t usable_size() const {
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
- if (contents_.allocation.get() != nullptr) {
- return malloc_usable_size(contents_.allocation.get());
- }
-#endif // ROCKSDB_MALLOC_USABLE_SIZE
- return size_;
- }
+ // The additional memory space taken by the block data.
+ size_t usable_size() const { return contents_.usable_size(); }
uint32_t NumRestarts() const;
+ BlockBasedTableOptions::DataBlockIndexType IndexType() const;
CompressionType compression_type() const {
return contents_.compression_type;
}
- // If hash index lookup is enabled and `use_hash_index` is true. This block
- // will do hash lookup for the key prefix.
- //
- // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
- // the iterator will simply be set as "invalid", rather than returning
- // the key that is just pass the target key.
+ // If comparator is InternalKeyComparator, user_comparator is its user
+ // comparator; they are equal otherwise.
//
// If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator*
//
- // If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
- // This option only applies for index block. For data block, hash_index_
- // and prefix_index_ are null, so this option does not matter.
- InternalIterator* NewIterator(const Comparator* comparator,
- BlockIter* iter = nullptr,
- bool total_order_seek = true,
- Statistics* stats = nullptr);
- void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
+ // key_includes_seq, default true, means that the keys are in internal key
+ // format.
+ // value_is_full, default ture, means that no delta encoding is
+ // applied to values.
+ //
+ // NewIterator<DataBlockIter>
+ // Same as above but also updates read_amp_bitmap_ if it is not nullptr.
+ //
+ // NewIterator<IndexBlockIter>
+ // If `prefix_index` is not nullptr this block will do hash lookup for the key
+ // prefix. If total_order_seek is true, prefix_index_ is ignored.
+ //
+ // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+ // the iterator will simply be set as "invalid", rather than returning
+ // the key that is just pass the target key.
+ template <typename TBlockIter>
+ TBlockIter* NewIterator(
+ const Comparator* comparator, const Comparator* user_comparator,
+ TBlockIter* iter = nullptr, Statistics* stats = nullptr,
+ bool total_order_seek = true, bool key_includes_seq = true,
+ bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr);
// Report an approximation of how much memory has been used.
size_t ApproximateMemoryUsage() const;
const char* data_; // contents_.data.data()
size_t size_; // contents_.data.size()
uint32_t restart_offset_; // Offset in data_ of restart array
- std::unique_ptr<BlockPrefixIndex> prefix_index_;
+ uint32_t num_restarts_;
std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
// All keys in the block will have seqno = global_seqno_, regardless of
// the encoded value (kDisableGlobalSequenceNumber means disabled)
const SequenceNumber global_seqno_;
+ DataBlockHashIndex data_block_hash_index_;
+
// No copying allowed
- Block(const Block&);
- void operator=(const Block&);
+ Block(const Block&) = delete;
+ void operator=(const Block&) = delete;
};
-class BlockIter : public InternalIterator {
+template <class TValue>
+class BlockIter : public InternalIteratorBase<TValue> {
public:
- BlockIter()
- : comparator_(nullptr),
- data_(nullptr),
- restarts_(0),
- num_restarts_(0),
- current_(0),
- restart_index_(0),
- status_(Status::OK()),
- prefix_index_(nullptr),
- key_pinned_(false),
- global_seqno_(kDisableGlobalSequenceNumber),
- read_amp_bitmap_(nullptr),
- last_bitmap_offset_(0) {}
-
- BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
- uint32_t num_restarts, BlockPrefixIndex* prefix_index,
- SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap)
- : BlockIter() {
- Initialize(comparator, data, restarts, num_restarts, prefix_index,
- global_seqno, read_amp_bitmap);
- }
-
- void Initialize(const Comparator* comparator, const char* data,
- uint32_t restarts, uint32_t num_restarts,
- BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
- BlockReadAmpBitmap* read_amp_bitmap) {
+ void InitializeBase(const Comparator* comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno, bool block_contents_pinned) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
num_restarts_ = num_restarts;
current_ = restarts_;
restart_index_ = num_restarts_;
- prefix_index_ = prefix_index;
global_seqno_ = global_seqno;
- read_amp_bitmap_ = read_amp_bitmap;
- last_bitmap_offset_ = current_ + 1;
+ block_contents_pinned_ = block_contents_pinned;
}
- void SetStatus(Status s) {
+ // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
+ // nothing. Calls cleanup functions.
+ void InvalidateBase(Status s) {
+ // Assert that the BlockIter is never deleted while Pinning is Enabled.
+ assert(!pinned_iters_mgr_ ||
+ (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
+
+ data_ = nullptr;
+ current_ = restarts_;
status_ = s;
+
+ // Call cleanup callbacks.
+ Cleanable::Reset();
}
virtual bool Valid() const override { return current_ < restarts_; }
virtual Status status() const override { return status_; }
virtual Slice key() const override {
assert(Valid());
- return key_.GetInternalKey();
+ return key_.GetKey();
}
- virtual Slice value() const override {
- assert(Valid());
- if (read_amp_bitmap_ && current_ < restarts_ &&
- current_ != last_bitmap_offset_) {
- read_amp_bitmap_->Mark(current_ /* current entry offset */,
- NextEntryOffset() - 1);
- last_bitmap_offset_ = current_;
- }
- return value_;
- }
-
- virtual void Next() override;
-
- virtual void Prev() override;
-
- virtual void Seek(const Slice& target) override;
-
- virtual void SeekForPrev(const Slice& target) override;
-
- virtual void SeekToFirst() override;
-
- virtual void SeekToLast() override;
#ifndef NDEBUG
- ~BlockIter() {
+ virtual ~BlockIter() {
// Assert that the BlockIter is never deleted while Pinning is Enabled.
assert(!pinned_iters_mgr_ ||
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif
- virtual bool IsKeyPinned() const override { return key_pinned_; }
+ virtual bool IsKeyPinned() const override {
+ return block_contents_pinned_ && key_pinned_;
+ }
- virtual bool IsValuePinned() const override { return true; }
+ virtual bool IsValuePinned() const override { return block_contents_pinned_; }
size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
return static_cast<uint32_t>(value_.data() - data_);
}
- private:
+ protected:
+ // Note: The type could be changed to InternalKeyComparator but we see a weird
+ // performance drop by that.
const Comparator* comparator_;
const char* data_; // underlying block contents
- uint32_t restarts_; // Offset of restart array (list of fixed32)
uint32_t num_restarts_; // Number of uint32_t entries in restart array
+ // Index of restart block in which current_ or current_-1 falls
+ uint32_t restart_index_;
+ uint32_t restarts_; // Offset of restart array (list of fixed32)
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
- uint32_t restart_index_; // Index of restart block in which current_ falls
IterKey key_;
Slice value_;
Status status_;
- BlockPrefixIndex* prefix_index_;
bool key_pinned_;
+ // whether the block data is guaranteed to outlive this iterator
+ bool block_contents_pinned_;
SequenceNumber global_seqno_;
+ public:
+ // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const {
+ // NOTE: We don't support blocks bigger than 2GB
+ return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+ }
+
+ uint32_t GetRestartPoint(uint32_t index) {
+ assert(index < num_restarts_);
+ return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+ }
+
+ void SeekToRestartPoint(uint32_t index) {
+ key_.Clear();
+ restart_index_ = index;
+ // current_ will be fixed by ParseNextKey();
+
+ // ParseNextKey() starts at the end of value_, so set value_ accordingly
+ uint32_t offset = GetRestartPoint(index);
+ value_ = Slice(data_ + offset, 0);
+ }
+
+ void CorruptionError();
+
+ template <typename DecodeKeyFunc>
+ inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+ uint32_t* index, const Comparator* comp);
+};
+
+class DataBlockIter final : public BlockIter<Slice> {
+ public:
+ DataBlockIter()
+ : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
+ DataBlockIter(const Comparator* comparator, const Comparator* user_comparator,
+ const char* data, uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index)
+ : DataBlockIter() {
+ Initialize(comparator, user_comparator, data, restarts, num_restarts,
+ global_seqno, read_amp_bitmap, block_contents_pinned,
+ data_block_hash_index);
+ }
+ void Initialize(const Comparator* comparator,
+ const Comparator* user_comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap,
+ bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index) {
+ InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
+ block_contents_pinned);
+ user_comparator_ = user_comparator;
+ key_.SetIsUserKey(false);
+ read_amp_bitmap_ = read_amp_bitmap;
+ last_bitmap_offset_ = current_ + 1;
+ data_block_hash_index_ = data_block_hash_index;
+ }
+
+ virtual Slice value() const override {
+ assert(Valid());
+ if (read_amp_bitmap_ && current_ < restarts_ &&
+ current_ != last_bitmap_offset_) {
+ read_amp_bitmap_->Mark(current_ /* current entry offset */,
+ NextEntryOffset() - 1);
+ last_bitmap_offset_ = current_;
+ }
+ return value_;
+ }
+
+ virtual void Seek(const Slice& target) override;
+
+ inline bool SeekForGet(const Slice& target) {
+ if (!data_block_hash_index_) {
+ Seek(target);
+ return true;
+ }
+
+ return SeekForGetImpl(target);
+ }
+
+ virtual void SeekForPrev(const Slice& target) override;
+
+ virtual void Prev() override;
+
+ virtual void Next() override;
+
+ virtual void SeekToFirst() override;
+
+ virtual void SeekToLast() override;
+
+ void Invalidate(Status s) {
+ InvalidateBase(s);
+ // Clear prev entries cache.
+ prev_entries_keys_buff_.clear();
+ prev_entries_.clear();
+ prev_entries_idx_ = -1;
+ }
+
+ private:
// read-amp bitmap
BlockReadAmpBitmap* read_amp_bitmap_;
// last `current_` value we report to read-amp bitmp
mutable uint32_t last_bitmap_offset_;
-
struct CachedPrevEntry {
explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
size_t _key_offset, size_t _key_size, Slice _value)
std::vector<CachedPrevEntry> prev_entries_;
int32_t prev_entries_idx_ = -1;
- inline int Compare(const Slice& a, const Slice& b) const {
- return comparator_->Compare(a, b);
+ DataBlockHashIndex* data_block_hash_index_;
+ const Comparator* user_comparator_;
+
+ inline bool ParseNextDataKey(const char* limit = nullptr);
+
+ inline int Compare(const IterKey& ikey, const Slice& b) const {
+ return comparator_->Compare(ikey.GetInternalKey(), b);
}
- // Return the offset in data_ just past the end of the current entry.
- inline uint32_t NextEntryOffset() const {
- // NOTE: We don't support blocks bigger than 2GB
- return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+ bool SeekForGetImpl(const Slice& target);
+};
+
+class IndexBlockIter final : public BlockIter<BlockHandle> {
+ public:
+ IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
+
+ virtual Slice key() const override {
+ assert(Valid());
+ return key_.GetKey();
+ }
+ // key_includes_seq, default true, means that the keys are in internal key
+ // format.
+ // value_is_full, default ture, means that no delta encoding is
+ // applied to values.
+ IndexBlockIter(const Comparator* comparator,
+ const Comparator* user_comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ BlockPrefixIndex* prefix_index, bool key_includes_seq,
+ bool value_is_full, bool block_contents_pinned)
+ : IndexBlockIter() {
+ Initialize(comparator, user_comparator, data, restarts, num_restarts,
+ prefix_index, key_includes_seq, block_contents_pinned,
+ value_is_full, nullptr /* data_block_hash_index */);
}
- uint32_t GetRestartPoint(uint32_t index) {
- assert(index < num_restarts_);
- return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+ void Initialize(const Comparator* comparator,
+ const Comparator* user_comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ BlockPrefixIndex* prefix_index, bool key_includes_seq,
+ bool value_is_full, bool block_contents_pinned,
+ DataBlockHashIndex* /*data_block_hash_index*/) {
+ InitializeBase(key_includes_seq ? comparator : user_comparator, data,
+ restarts, num_restarts, kDisableGlobalSequenceNumber,
+ block_contents_pinned);
+ key_includes_seq_ = key_includes_seq;
+ key_.SetIsUserKey(!key_includes_seq_);
+ prefix_index_ = prefix_index;
+ value_delta_encoded_ = !value_is_full;
}
- void SeekToRestartPoint(uint32_t index) {
- key_.Clear();
- restart_index_ = index;
- // current_ will be fixed by ParseNextKey();
+ virtual BlockHandle value() const override {
+ assert(Valid());
+ if (value_delta_encoded_) {
+ return decoded_value_;
+ } else {
+ BlockHandle handle;
+ Slice v = value_;
+ Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v);
+ assert(decode_s.ok());
+ return handle;
+ }
+ }
- // ParseNextKey() starts at the end of value_, so set value_ accordingly
- uint32_t offset = GetRestartPoint(index);
- value_ = Slice(data_ + offset, 0);
+ virtual void Seek(const Slice& target) override;
+
+ virtual void SeekForPrev(const Slice&) override {
+ assert(false);
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::InvalidArgument(
+ "RocksDB internal error: should never call SeekForPrev() on index "
+ "blocks");
+ key_.Clear();
+ value_.clear();
}
- void CorruptionError();
+ virtual void Prev() override;
+
+ virtual void Next() override;
+
+ virtual void SeekToFirst() override;
- bool ParseNextKey();
+ virtual void SeekToLast() override;
- bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
- uint32_t* index);
+ void Invalidate(Status s) { InvalidateBase(s); }
- int CompareBlockKey(uint32_t block_index, const Slice& target);
+ private:
+ // Key is in InternalKey format
+ bool key_includes_seq_;
+ bool value_delta_encoded_;
+ BlockPrefixIndex* prefix_index_;
+ // Whether the value is delta encoded. In that case the value is assumed to be
+ // BlockHandle. The first value in each restart interval is the full encoded
+ // BlockHandle; the restart of encoded size part of the BlockHandle. The
+ // offset of delta encoded BlockHandles is computed by adding the size of
+ // previous delta encoded values in the same restart interval to the offset of
+ // the first value in that restart interval.
+ BlockHandle decoded_value_;
+ bool PrefixSeek(const Slice& target, uint32_t* index);
bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
uint32_t left, uint32_t right,
uint32_t* index);
+ inline int CompareBlockKey(uint32_t block_index, const Slice& target);
- bool PrefixSeek(const Slice& target, uint32_t* index);
+ inline int Compare(const Slice& a, const Slice& b) const {
+ return comparator_->Compare(a, b);
+ }
+
+ inline int Compare(const IterKey& ikey, const Slice& b) const {
+ return comparator_->Compare(ikey.GetKey(), b);
+ }
+
+ inline bool ParseNextIndexKey();
+ // When value_delta_encoded_ is enabled it decodes the value which is assumed
+ // to be BlockHandle and put it to decoded_value_
+ inline void DecodeCurrentValue(uint32_t shared);
};
} // namespace rocksdb