// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "format.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_prefix_index.h"
#include "table/data_block_hash_index.h"
#include "table/internal_iterator.h"
#include "util/random.h"
#include "util/sync_point.h"

namespace rocksdb {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class BlockPrefixIndex;

// BlockReadAmpBitmap is a bitmap that maps the rocksdb::Block data bytes to
// bits, at a ratio of bytes_per_bit bytes per bit. Whenever we access a range
// of bytes in the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
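//
// A worked example (illustrative numbers, not defaults): for a 4096-byte
// block with bytes_per_bit = 32, bytes_per_bit_pow_ is 5, the bitmap needs
// ceil(4096 / 32) = 128 bits, which fit in ceil(128 / 32) = 4 of the
// uint32_t entries allocated in the constructor below.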
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(
            Random::GetTLSInstance()->Uniform(static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // Compute bytes_per_bit_pow_ = floor(log2(bytes_per_bit))
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }

  ~BlockReadAmpBitmap() { delete[] bitmap_; }

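  // Mark the bits covering the byte range [start_offset, end_offset] as read.
  // The random shift rnd_ offsets the bit grid per block, so that ranges
  // which only partially cover a bit's byte span are, in expectation,
  // counted without systematic bias.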
  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

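    // Only the range's first bit is tested and set; if it was previously
    // unset, the entire range is credited as newly useful bytes below.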
    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }

  const uint32_t kBytesPerEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read; use atomic to protect
  // against multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use a power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to the DB Statistics object. Since this bitmap may outlive the
  // DB, this pointer may become invalid, but the DB will update it to a valid
  // pointer via SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
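  // Random byte shift, uniform in [0, bytes_per_bit), applied to the bit grid
  // in Mark()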
  uint32_t rnd_;
};

class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, SequenceNumber _global_seqno,
                 size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  bool cachable() const { return contents_.cachable; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
  CompressionType compression_type() const {
    return contents_.compression_type;
  }

  // If comparator is InternalKeyComparator, user_comparator is its user
  // comparator; otherwise they are equal.
  //
  // If iter is null, return a new Iterator.
  // If iter is not null, update this one and return it as Iterator*.
  //
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // NewIterator<DataBlockIter>
  // Same as above but also updates read_amp_bitmap_ if it is not nullptr.
  //
  // NewIterator<IndexBlockIter>
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
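  //
  // A usage sketch (hypothetical caller code; `block`, `icmp` and `ucmp` are
  // placeholders for an existing Block* and its comparators):
  //
  //   DataBlockIter* it = block->NewIterator<DataBlockIter>(icmp, ucmp);
  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //     // consume it->key() / it->value()
  //   }
  //   delete it;  // only when no reusable iterator was passed in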
  template <typename TBlockIter>
  TBlockIter* NewIterator(
      const Comparator* comparator, const Comparator* user_comparator,
      TBlockIter* iter = nullptr, Statistics* stats = nullptr,
      bool total_order_seek = true, bool key_includes_seq = true,
      bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr);

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

  SequenceNumber global_seqno() const { return global_seqno_; }

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  // All keys in the block will have seqno = global_seqno_, regardless of
  // the encoded value (kDisableGlobalSequenceNumber means disabled)
  const SequenceNumber global_seqno_;

  DataBlockHashIndex data_block_hash_index_;

  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;
};

template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  void InitializeBase(const Comparator* comparator, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno,
                      bool block_contents_pinned) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    comparator_ = comparator;
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    block_contents_pinned_ = block_contents_pinned;
  }

  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  void InvalidateBase(Status s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  virtual bool Valid() const override { return current_ < restarts_; }
  virtual Status status() const override { return status_; }
  virtual Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }

#ifndef NDEBUG
  virtual ~BlockIter() {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ ||
           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
  }
  virtual void SetPinnedItersMgr(
      PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

  virtual bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  virtual bool IsValuePinned() const override {
    return block_contents_pinned_;
  }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

 protected:
  // Note: The type could be changed to InternalKeyComparator, but we observed
  // a strange performance drop when doing so.
  const Comparator* comparator_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry. >= restarts_ if !Valid
  uint32_t current_;
  IterKey key_;
  Slice value_;
  Status status_;
  bool key_pinned_;
  // whether the block data is guaranteed to outlive this iterator
  bool block_contents_pinned_;
  SequenceNumber global_seqno_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

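  // Layout sketch these helpers assume (the classic LevelDB block format):
  // data_ holds the entries, followed at offset restarts_ by num_restarts_
  // fixed32 values, where value i is the block offset at which the i-th
  // restart interval starts with a fully (non-delta) encoded key:
  //
  //   data_: [entry][entry]...[entry] | [restart_0]...[restart_{n-1}]
  //                                     ^ restarts_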
  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

  void CorruptionError();

  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                         uint32_t* index, const Comparator* comp);
};

class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  DataBlockIter(const Comparator* comparator,
                const Comparator* user_comparator, const char* data,
                uint32_t restarts, uint32_t num_restarts,
                SequenceNumber global_seqno,
                BlockReadAmpBitmap* read_amp_bitmap,
                bool block_contents_pinned,
                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               global_seqno, read_amp_bitmap, block_contents_pinned,
               data_block_hash_index);
  }
  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
    user_comparator_ = user_comparator;
    key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  virtual Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  virtual void Seek(const Slice& target) override;

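  // Point lookup entry. Without a data block hash index it falls back to a
  // regular Seek() and always returns true; with one, SeekForGetImpl() can
  // answer the lookup through the hash table instead (see its definition for
  // the exact contract on the return value).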
  inline bool SeekForGet(const Slice& target) {
    if (!data_block_hash_index_) {
      Seek(target);
      return true;
    }

    return SeekForGetImpl(target);
  }

  virtual void SeekForPrev(const Slice& target) override;

  virtual void Prev() override;

  virtual void Next() override;

  virtual void SeekToFirst() override;

  virtual void SeekToLast() override;

  void Invalidate(Status s) {
    InvalidateBase(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we reported to the read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;
  const Comparator* user_comparator_;

  inline bool ParseNextDataKey(const char* limit = nullptr);

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetInternalKey(), b);
  }

  bool SeekForGetImpl(const Slice& target);
};

class IndexBlockIter final : public BlockIter<BlockHandle> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  virtual Slice key() const override {
    assert(Valid());
    return key_.GetKey();
  }
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  IndexBlockIter(const Comparator* comparator,
                 const Comparator* user_comparator, const char* data,
                 uint32_t restarts, uint32_t num_restarts,
                 BlockPrefixIndex* prefix_index, bool key_includes_seq,
                 bool value_is_full, bool block_contents_pinned)
      : IndexBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               prefix_index, key_includes_seq, value_is_full,
               block_contents_pinned, nullptr /* data_block_hash_index */);
  }

  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  BlockPrefixIndex* prefix_index, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned,
                  DataBlockHashIndex* /*data_block_hash_index*/) {
    InitializeBase(key_includes_seq ? comparator : user_comparator, data,
                   restarts, num_restarts, kDisableGlobalSequenceNumber,
                   block_contents_pinned);
    key_includes_seq_ = key_includes_seq;
    key_.SetIsUserKey(!key_includes_seq_);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
  }

  virtual BlockHandle value() const override {
    assert(Valid());
    if (value_delta_encoded_) {
      return decoded_value_;
    } else {
      BlockHandle handle;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v);
      assert(decode_s.ok());
      return handle;
    }
  }

  virtual void Seek(const Slice& target) override;

  virtual void SeekForPrev(const Slice&) override {
    assert(false);
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    key_.Clear();
    value_.clear();
  }

  virtual void Prev() override;

  virtual void Next() override;

  virtual void SeekToFirst() override;

  virtual void SeekToLast() override;

  void Invalidate(Status s) { InvalidateBase(s); }

 private:
  // Key is in InternalKey format
  bool key_includes_seq_;
  bool value_delta_encoded_;
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded (see value_delta_encoded_ above). In
  // that case the value is assumed to be a BlockHandle. The first value in
  // each restart interval is the full encoded BlockHandle; the rest encode
  // only the size part of the BlockHandle. The offset of a delta encoded
  // BlockHandle is computed by adding the sizes of the previous delta encoded
  // values in the same restart interval to the offset of the first value in
  // that restart interval.
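  //
  // A worked example of the scheme above (hypothetical handles; any fixed
  // per-entry trailer the implementation adds is ignored here): if the first
  // value in a restart interval decodes to (offset = 0, size = 4096) and the
  // next two entries store only the sizes 2048 and 1024, the decoded handles
  // are (offset = 4096, size = 2048) and (offset = 4096 + 2048, size = 1024).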
  BlockHandle decoded_value_;

  bool PrefixSeek(const Slice& target, uint32_t* index);
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetKey(), b);
  }

  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled it decodes the value, which is
  // assumed to be a BlockHandle, and puts it in decoded_value_
  inline void DecodeCurrentValue(uint32_t shared);
};

}  // namespace rocksdb