// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdint.h>

#include <atomic>
#include <memory>
#include <string>
#include <vector>

#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#include <malloc_np.h>
#endif  // ROCKSDB_MALLOC_USABLE_SIZE

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_prefix_index.h"
#include "table/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "util/random.h"
#include "util/sync_point.h"
namespace rocksdb {

template <class TValue>
class BlockIter;
class BlockPrefixIndex;
// BlockReadAmpBitmap is a bitmap that maps the rocksdb::Block data bytes to
// a bitmap with ratio bytes_per_bit: each bit covers bytes_per_bit bytes of
// the block. Whenever we access a range of bytes in the Block we update the
// bitmap and increment READ_AMP_ESTIMATE_USEFUL_BYTES.
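//
// A quick worked example (illustrative numbers, not defaults): with
// block_size = 4096 and bytes_per_bit = 32, the bitmap tracks
// ceil(4096 / 32) = 128 bits; each bit covers one 32-byte stripe of the
// block, and a stripe's bytes are only counted as useful the first time a
// read touches it.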
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // convert bytes_per_bit to be a power of 2
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }
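  // Sizing example for the math above (illustrative): block_size = 4096,
  // bytes_per_bit = 32
  //   bytes_per_bit_pow_ = 5                         (32 == 1 << 5)
  //   num_bits_needed    = ((4096 - 1) >> 5) + 1  = 128
  //   bitmap_size        = (128 - 1) / 32 + 1     = 4 uint32_t entries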
  ~BlockReadAmpBitmap() { delete[] bitmap_; }
  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }
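  // Marking example (illustrative, assuming rnd_ == 0 and bytes_per_bit ==
  // 32): Mark(0, 100) covers bits [0, 4). Only start_bit is set via
  // GetAndSet(); if it was previously clear, the whole
  // (exclusive_end_bit - start_bit) << bytes_per_bit_pow_ == 128 bytes are
  // credited to READ_AMP_ESTIMATE_USEFUL_BYTES. rnd_ shifts the stripe
  // boundaries by a random per-block amount, randomizing which bytes share a
  // stripe.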
  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }
  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }
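  // E.g. bit_idx = 37: byte_idx = 37 / 32 = 1, bit_mask = 1 << (37 % 32) =
  // 1 << 5; fetch_or() returns the previous word, so the result is nonzero
  // iff bit 37 was already set.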
  const uint32_t kBytesPersEntry = sizeof(uint32_t);  // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPersEntry * 8;  // 32 bits
  // Bitmap used to record the bytes that we read, use atomic to protect
  // against multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to DB Statistics object. Since this bitmap may outlive the DB,
  // this pointer may be invalid, but the DB will update it to a valid pointer
  // by calling SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};
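// Usage sketch (illustrative; within RocksDB the bitmap is owned by Block and
// Mark() is driven by DataBlockIter::value() below):
//
//   BlockReadAmpBitmap bitmap(4096 /* block_size */, 32 /* bytes_per_bit */,
//                             stats /* Statistics*, may be nullptr */);
//   bitmap.Mark(128, 255);  // bytes [128, 255] of the block were read
//   // Comparing READ_AMP_ESTIMATE_USEFUL_BYTES with
//   // READ_AMP_TOTAL_READ_BYTES then gives an estimate of read
//   // amplification.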
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, SequenceNumber _global_seqno,
                 size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  size_t size() const { return size_; }
  const char* data() const { return data_; }
  bool cachable() const { return contents_.cachable; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
  CompressionType compression_type() const {
    return contents_.compression_type;
  }
  // If comparator is InternalKeyComparator, user_comparator is its user
  // comparator; they are equal otherwise.
  //
  // If iter is null, return a new iterator.
  // If iter is not null, update it in place and return it as Iterator*.
  //
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // NewIterator<DataBlockIter>
  // Same as above but also updates read_amp_bitmap_ if it is not nullptr.
  //
  // NewIterator<IndexBlockIter>
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  template <typename TBlockIter>
  TBlockIter* NewIterator(
      const Comparator* comparator, const Comparator* user_comparator,
      TBlockIter* iter = nullptr, Statistics* stats = nullptr,
      bool total_order_seek = true, bool key_includes_seq = true,
      bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr);
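  // Illustrative call (assumes `block` is a Block* and `icmp` an
  // InternalKeyComparator; the remaining arguments keep the defaults above):
  //
  //   DataBlockIter* it = block->NewIterator<DataBlockIter>(
  //       &icmp, icmp.user_comparator());
  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //     Slice key = it->key();
  //     Slice value = it->value();
  //   }
  //   delete it;  // caller owns the iterator when `iter` was nullptr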
  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  SequenceNumber global_seqno() const { return global_seqno_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  // All keys in the block will have seqno = global_seqno_, regardless of
  // the encoded value (kDisableGlobalSequenceNumber means disabled)
  const SequenceNumber global_seqno_;

  DataBlockHashIndex data_block_hash_index_;
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;
};
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  void InitializeBase(const Comparator* comparator, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno, bool block_contents_pinned) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    comparator_ = comparator;
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    block_contents_pinned_ = block_contents_pinned;
  }
  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  void InvalidateBase(Status s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }
  virtual bool Valid() const override { return current_ < restarts_; }
  virtual Status status() const override { return status_; }
  virtual Slice key() const override {
    return key_.GetKey();
  }
  virtual ~BlockIter() {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
  }

  virtual void SetPinnedItersMgr(
      PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }

  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
  virtual bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  virtual bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

 protected:
  // Note: The type could be changed to InternalKeyComparator but we see a
  // weird performance drop by that.
  const Comparator* comparator_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry. >= restarts_ if !Valid
  uint32_t current_;
  IterKey key_;
  Slice value_;
  Status status_;
  bool key_pinned_;
  // whether the block data is guaranteed to outlive this iterator
  bool block_contents_pinned_;
  SequenceNumber global_seqno_;
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }
  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }
  void SeekToRestartPoint(uint32_t index) {
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }
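  // Simplified sketch of the block layout these helpers rely on:
  //
  //   [entry 0][entry 1] ... [entry n-1]
  //   [restart 0 (fixed32)][restart 1] ... [restart k-1]   <- restarts_
  //   [footer with num_restarts]
  //
  // GetRestartPoint(i) returns the offset in data_ of the first entry of
  // restart interval i; entries within an interval are prefix-compressed
  // against their predecessor, and every restart point starts with a full
  // key.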
  void CorruptionError();

  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                         uint32_t* index, const Comparator* comp);
};
class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  DataBlockIter(const Comparator* comparator, const Comparator* user_comparator,
                const char* data, uint32_t restarts, uint32_t num_restarts,
                SequenceNumber global_seqno,
                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               global_seqno, read_amp_bitmap, block_contents_pinned,
               data_block_hash_index);
  }
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
    user_comparator_ = user_comparator;
    key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }
  virtual Slice value() const override {
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }
  virtual void Seek(const Slice& target) override;

  inline bool SeekForGet(const Slice& target) {
    if (!data_block_hash_index_) {
      // Without a hash index, fall back to a regular Seek.
      Seek(target);
      return true;
    }

    return SeekForGetImpl(target);
  }
  virtual void SeekForPrev(const Slice& target) override;

  virtual void Prev() override;

  virtual void Next() override;

  virtual void SeekToFirst() override;

  virtual void SeekToLast() override;
  void Invalidate(Status s) {
    InvalidateBase(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 private:
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we report to read-amp bitmap
  mutable uint32_t last_bitmap_offset_;

  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of the key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };

  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;
  const Comparator* user_comparator_;
  inline bool ParseNextDataKey(const char* limit = nullptr);

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetInternalKey(), b);
  }

  bool SeekForGetImpl(const Slice& target);
};
class IndexBlockIter final : public BlockIter<BlockHandle> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  virtual Slice key() const override {
    return key_.GetKey();
  }
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  IndexBlockIter(const Comparator* comparator,
                 const Comparator* user_comparator, const char* data,
                 uint32_t restarts, uint32_t num_restarts,
                 BlockPrefixIndex* prefix_index, bool key_includes_seq,
                 bool value_is_full, bool block_contents_pinned)
      : IndexBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               prefix_index, key_includes_seq, value_is_full,
               block_contents_pinned, nullptr /* data_block_hash_index */);
  }
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  BlockPrefixIndex* prefix_index, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned,
                  DataBlockHashIndex* /*data_block_hash_index*/) {
    InitializeBase(key_includes_seq ? comparator : user_comparator, data,
                   restarts, num_restarts, kDisableGlobalSequenceNumber,
                   block_contents_pinned);
    key_includes_seq_ = key_includes_seq;
    key_.SetIsUserKey(!key_includes_seq_);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
  }
  virtual BlockHandle value() const override {
    if (value_delta_encoded_) {
      return decoded_value_;
    } else {
      BlockHandle handle;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v);
      assert(decode_s.ok());
      return handle;
    }
  }
  virtual void Seek(const Slice& target) override;

  virtual void SeekForPrev(const Slice&) override {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
  }
  virtual void Prev() override;

  virtual void Next() override;

  virtual void SeekToFirst() override;

  virtual void SeekToLast() override;

  void Invalidate(Status s) { InvalidateBase(s); }

 private:
  // Key is in InternalKey format
  bool key_includes_seq_;
  bool value_delta_encoded_;
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded. In that case the value is assumed to
  // be a BlockHandle. The first value in each restart interval is the full
  // encoded BlockHandle; the rest only encode the size part of the
  // BlockHandle. The offset of a delta encoded BlockHandle is computed by
  // adding the sizes of the previous delta encoded values in the same restart
  // interval to the offset of the first value in that restart interval.
  // decoded_value_ holds the BlockHandle decoded from the current entry.
  BlockHandle decoded_value_;
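  // For example (illustrative): in a restart interval of three index entries,
  // entry 0 stores the full handle (offset + size) while entries 1 and 2
  // store only their sizes; their offsets are reconstructed from entry 0's
  // offset plus the accumulated sizes of the entries before them (see
  // DecodeCurrentValue()).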
  bool PrefixSeek(const Slice& target, uint32_t* index);
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right,
                            uint32_t* index);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetKey(), b);
  }
  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled, decodes the value, which is assumed
  // to be a BlockHandle, and puts it into decoded_value_.
  inline void DecodeCurrentValue(uint32_t shared);
};

}  // namespace rocksdb