// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
  }

  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
    return nullptr;
  }
  return p;
}
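
// Each entry parsed by DecodeEntry() is laid out as:
//
//   shared       (varint32): key bytes shared with the previous key
//   non_shared   (varint32): key bytes unique to this entry
//   value_length (varint32): length of the value
//   key_delta    (char[non_shared])
//   value        (char[value_length])
//
// For illustration: if the previous key is "apple" and an entry stores
// shared=4, non_shared=1, key_delta="y", the reconstructed key is "apply"
// ("appl" from the previous key plus "y"). Entries at restart points always
// have shared == 0, so their keys are stored in full.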

void BlockIter::Next() {
  assert(Valid());
  ParseNextKey();
}

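// Prev() is more expensive than Next(): because keys are delta encoded in the
// forward direction only, the iterator backs up to the nearest restart point
// before the current entry and re-parses forward, caching each decoded entry
// in prev_entries_ along the way. Consecutive Prev() calls are then served
// from that cache until it is exhausted.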
void BlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetInternalKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextKey()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

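// Seek() proceeds in two phases: a binary search over the restart array (or a
// prefix-index lookup, when one is attached) to find the restart interval that
// may contain the target, followed by a linear scan of the delta-encoded
// entries within that interval.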
void BlockIter::Seek(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else {
    ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
      return;
    }
  }
}

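// SeekForPrev() positions the iterator at the last entry with a key <= target:
// it first scans forward to the first key >= target, then steps back with
// Prev() while the current key still compares greater than the target.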
void BlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  ok = BinarySeek(target, 0, num_restarts_ - 1, &index);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
      Prev();
    }
  }
}

void BlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextKey();
}

void BlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void BlockIter::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

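// ParseNextKey() decodes the entry at NextEntryOffset(). When an entry shares
// no bytes with its predecessor (shared == 0), the key Slice can point
// directly into the block and is marked "pinned"; otherwise the key must be
// materialized in key_'s own buffer by appending the delta to the shared
// prefix of the previous key.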
bool BlockIter::ParseNextKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the previous key, we don't
      // need to decode it and can use its address in the block directly.
      key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the previous key; we need to
      // decode it.
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and all value
      // types are kTypeValue
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);
      assert(ExtractValueType(key_.GetInternalKey()) == ValueType::kTypeValue);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, ValueType::kTypeValue);
    }

    value_ = Slice(p + non_shared, value_length);
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
    return true;
  }
}

// Binary search in the restart array to find the restart point to scan from:
// either the last restart point with a key less than target (so the key of
// the next restart point is greater than target), or a restart point whose
// key equals target.
bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                           uint32_t* index) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared, value_length;
    const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                      &shared, &non_shared, &value_length);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // restart points before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all
      // restart points at or after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}

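// For illustration: with restart keys ["apple", "cherry", "mango"] and target
// "dog", BinarySeek returns the index of "cherry" (the last restart key less
// than the target); the caller then scans forward from that restart point.
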
// Compare the target key with the key at restart point `block_index`.
// On corruption, sets the error status and returns 1 (treating the target
// as smaller).
int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared, value_length;
  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                    &shared, &non_shared, &value_length);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                                     uint32_t left, uint32_t right,
                                     uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // We may be in one of two cases:
    // (1) `left` is the first entry in block_ids, or
    // (2) there is a gap between the blocks of `left` and `left - 1`.
    // In either case we can distinguish "key in the block" from "key not
    // existing" by comparing the target with the key of the block just
    // before the one found; if even that key is greater than the target,
    // the target cannot be in any candidate block.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

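// Note: PrefixSeek() gives a meaningful result only when keys with the
// target's prefix may exist in the block; callers that need total-order seek
// semantics pass total_order_seek to Block::NewIterator() below, which
// disables the prefix index for the returned iterator.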
bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
  assert(prefix_index_);
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    return false;
  } else {
    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
  }
}

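// Physically, a block is laid out as:
//
//   entry 0 .. entry N-1         (delta-encoded key/value entries)
//   restart[0] .. restart[R-1]   (uint32 offsets of the restart points)
//   num_restarts                 (uint32, R)
//
// NumRestarts() reads the trailing uint32, and restart_offset_ (computed in
// the constructor below) points at restart[0].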
uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}

Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      global_seqno_(_global_seqno) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    restart_offset_ =
        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

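// NewIterator() can reuse a caller-provided BlockIter (passed via `iter`) to
// avoid a heap allocation; when `iter` is nullptr a new iterator is
// allocated. A minimal caller-side sketch (hypothetical usage, not from this
// file):
//
//   BlockIter it;
//   block->NewIterator(comparator, &it, true /* total_order_seek */, nullptr);
//   for (it.SeekToFirst(); it.Valid(); it.Next()) {
//     // consume it.key() / it.value()
//   }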
InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
                                     bool total_order_seek, Statistics* stats) {
  if (size_ < 2 * sizeof(uint32_t)) {
    if (iter != nullptr) {
      iter->SetStatus(Status::Corruption("bad block contents"));
      return iter;
    } else {
      return NewErrorInternalIterator(Status::Corruption("bad block contents"));
    }
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    if (iter != nullptr) {
      iter->SetStatus(Status::OK());
      return iter;
    } else {
      return NewEmptyInternalIterator();
    }
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index_.get();

    if (iter != nullptr) {
      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
                       prefix_index_ptr, global_seqno_, read_amp_bitmap_.get());
    } else {
      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
                           prefix_index_ptr, global_seqno_,
                           read_amp_bitmap_.get());
    }

    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // The DB changed the Statistics pointer; notify read_amp_bitmap_.
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return iter;
}

void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
  prefix_index_.reset(prefix_index);
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
  if (prefix_index_) {
    usage += prefix_index_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace rocksdb