]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | // | |
10 | // Decodes the blocks generated by block_builder.cc. | |
11 | ||
12 | #include "table/block.h" | |
13 | #include <algorithm> | |
14 | #include <string> | |
15 | #include <unordered_map> | |
16 | #include <vector> | |
17 | ||
18 | #include "monitoring/perf_context_imp.h" | |
19 | #include "port/port.h" | |
20 | #include "port/stack_trace.h" | |
21 | #include "rocksdb/comparator.h" | |
22 | #include "table/block_prefix_index.h" | |
11fdf7f2 | 23 | #include "table/data_block_footer.h" |
7c673cae FG |
24 | #include "table/format.h" |
25 | #include "util/coding.h" | |
26 | #include "util/logging.h" | |
27 | ||
28 | namespace rocksdb { | |
29 | ||
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    // Speculatively read the three header bytes; they are the full header iff
    // each fits in a single varint byte (value < 128).
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      // Slow path: at least one length needs multi-byte varint decoding.
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};
7c673cae | 65 | |
494da23a TL |
// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
// Used on paths that must tolerate (and report) block corruption rather
// than assume well-formed input.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      // Slow path: at least one length needs multi-byte varint decoding.
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Unlike DecodeEntry, report truncated key/value data as a decode
    // failure instead of asserting.
    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};
98 | ||
11fdf7f2 TL |
// Key-only adapter over DecodeEntry: decodes the shared/non_shared key
// lengths and discards the value length. Used by BinarySeek, which only
// needs to reconstruct keys at restart points.
struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};
106 | ||
// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      // Slow path: fall back to full varint decoding for both lengths.
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};
129 | ||
// Advance to the next entry. Decoding errors abort via assert (DecodeEntry).
void DataBlockIter::Next() {
  assert(Valid());
  ParseNextDataKey<DecodeEntry>();
}
134 | ||
// Advance to the next entry, but report corruption through status_ instead
// of asserting (CheckAndDecodeEntry).
void DataBlockIter::NextOrReport() {
  assert(Valid());
  ParseNextDataKey<CheckAndDecodeEntry>();
}
139 | ||
11fdf7f2 | 140 | void IndexBlockIter::Next() { |
7c673cae | 141 | assert(Valid()); |
11fdf7f2 | 142 | ParseNextIndexKey(); |
7c673cae FG |
143 | } |
144 | ||
11fdf7f2 TL |
// Step to the previous index entry. Since entries are delta encoded and can
// only be decoded forward, this backs up to the nearest restart point before
// current_ and re-parses forward until the entry just before current_.
void IndexBlockIter::Prev() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries: invalidate the iterator.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  do {
    if (!ParseNextIndexKey()) {
      break;
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
}
166 | ||
// Similar to IndexBlockIter::Prev but also caches the prev entries.
// Consecutive Prev() calls can then be served from prev_entries_ without
// re-parsing the restart interval each time.
void DataBlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_: the cache is valid only while
  // the current position matches the cached entry at prev_entries_idx_.
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Cache miss: clear prev entries cache and rebuild it below.
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries: invalidate the iterator.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded; cache a pointer into the block.
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
241 | ||
11fdf7f2 TL |
// Position the iterator at the first entry with key >= target.
// Binary search over restart points, then linear scan within the interval.
void DataBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextDataKey<DecodeEntry>() || Compare(key_, seek_key) >= 0) {
      return;
    }
  }
}
264 | ||
// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically as Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key from seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key_ with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_key is less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice user_key = ExtractUserKey(target);
  // The hash index map is stored right after the restart array.
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    Seek(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:  [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or to the end of the block if no one is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  // Limit the linear scan to this restart interval only.
  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reaches to the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(key_.GetKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    Seek(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}
381 | ||
// Position the iterator at the first index entry with key >= target.
// Uses the prefix index if available, otherwise binary search with the
// decoder matching the block's value encoding (plain or delta, format v4).
void IndexBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  if (!key_includes_seq_) {
    // Index keys are user keys only; strip the internal-key suffix.
    seek_key = ExtractUserKey(target);
  }
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
                                 comparator_);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                               comparator_);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) {
      return;
    }
  }
}
415 | ||
11fdf7f2 | 416 | void DataBlockIter::SeekForPrev(const Slice& target) { |
7c673cae | 417 | PERF_TIMER_GUARD(block_seek_nanos); |
11fdf7f2 | 418 | Slice seek_key = target; |
7c673cae FG |
419 | if (data_ == nullptr) { // Not init yet |
420 | return; | |
421 | } | |
422 | uint32_t index = 0; | |
11fdf7f2 TL |
423 | bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, |
424 | comparator_); | |
7c673cae FG |
425 | |
426 | if (!ok) { | |
427 | return; | |
428 | } | |
429 | SeekToRestartPoint(index); | |
11fdf7f2 | 430 | // Linear search (within restart block) for first key >= seek_key |
7c673cae | 431 | |
494da23a | 432 | while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { |
7c673cae FG |
433 | } |
434 | if (!Valid()) { | |
435 | SeekToLast(); | |
436 | } else { | |
11fdf7f2 | 437 | while (Valid() && Compare(key_, seek_key) > 0) { |
7c673cae FG |
438 | Prev(); |
439 | } | |
440 | } | |
441 | } | |
442 | ||
11fdf7f2 | 443 | void DataBlockIter::SeekToFirst() { |
7c673cae FG |
444 | if (data_ == nullptr) { // Not init yet |
445 | return; | |
446 | } | |
447 | SeekToRestartPoint(0); | |
494da23a TL |
448 | ParseNextDataKey<DecodeEntry>(); |
449 | } | |
450 | ||
// Like SeekToFirst(), but reports corruption through status_ instead of
// asserting (CheckAndDecodeEntry).
void DataBlockIter::SeekToFirstOrReport() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}
458 | ||
11fdf7f2 TL |
// Position the iterator at the first index entry of the block.
void IndexBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}
466 | ||
467 | void DataBlockIter::SeekToLast() { | |
7c673cae FG |
468 | if (data_ == nullptr) { // Not init yet |
469 | return; | |
470 | } | |
471 | SeekToRestartPoint(num_restarts_ - 1); | |
494da23a | 472 | while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) { |
7c673cae FG |
473 | // Keep skipping |
474 | } | |
475 | } | |
476 | ||
11fdf7f2 TL |
477 | void IndexBlockIter::SeekToLast() { |
478 | if (data_ == nullptr) { // Not init yet | |
479 | return; | |
480 | } | |
481 | SeekToRestartPoint(num_restarts_ - 1); | |
482 | while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { | |
483 | // Keep skipping | |
484 | } | |
485 | } | |
486 | ||
487 | template <class TValue> | |
488 | void BlockIter<TValue>::CorruptionError() { | |
7c673cae FG |
489 | current_ = restarts_; |
490 | restart_index_ = num_restarts_; | |
491 | status_ = Status::Corruption("bad entry in block"); | |
492 | key_.Clear(); | |
493 | value_.clear(); | |
494 | } | |
495 | ||
494da23a | 496 | template <typename DecodeEntryFunc> |
11fdf7f2 | 497 | bool DataBlockIter::ParseNextDataKey(const char* limit) { |
7c673cae FG |
498 | current_ = NextEntryOffset(); |
499 | const char* p = data_ + current_; | |
11fdf7f2 TL |
500 | if (!limit) { |
501 | limit = data_ + restarts_; // Restarts come right after data | |
502 | } | |
503 | ||
7c673cae FG |
504 | if (p >= limit) { |
505 | // No more entries to return. Mark as invalid. | |
506 | current_ = restarts_; | |
507 | restart_index_ = num_restarts_; | |
508 | return false; | |
509 | } | |
510 | ||
511 | // Decode next entry | |
512 | uint32_t shared, non_shared, value_length; | |
494da23a | 513 | p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); |
7c673cae FG |
514 | if (p == nullptr || key_.Size() < shared) { |
515 | CorruptionError(); | |
516 | return false; | |
517 | } else { | |
518 | if (shared == 0) { | |
519 | // If this key dont share any bytes with prev key then we dont need | |
520 | // to decode it and can use it's address in the block directly. | |
11fdf7f2 | 521 | key_.SetKey(Slice(p, non_shared), false /* copy */); |
7c673cae FG |
522 | key_pinned_ = true; |
523 | } else { | |
524 | // This key share `shared` bytes with prev key, we need to decode it | |
525 | key_.TrimAppend(shared, p, non_shared); | |
526 | key_pinned_ = false; | |
527 | } | |
528 | ||
529 | if (global_seqno_ != kDisableGlobalSequenceNumber) { | |
530 | // If we are reading a file with a global sequence number we should | |
11fdf7f2 TL |
531 | // expect that all encoded sequence numbers are zeros and any value |
532 | // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. | |
7c673cae | 533 | assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); |
11fdf7f2 TL |
534 | |
535 | ValueType value_type = ExtractValueType(key_.GetKey()); | |
536 | assert(value_type == ValueType::kTypeValue || | |
537 | value_type == ValueType::kTypeMerge || | |
538 | value_type == ValueType::kTypeDeletion || | |
539 | value_type == ValueType::kTypeRangeDeletion); | |
7c673cae FG |
540 | |
541 | if (key_pinned_) { | |
542 | // TODO(tec): Investigate updating the seqno in the loaded block | |
543 | // directly instead of doing a copy and update. | |
544 | ||
545 | // We cannot use the key address in the block directly because | |
546 | // we have a global_seqno_ that will overwrite the encoded one. | |
547 | key_.OwnKey(); | |
548 | key_pinned_ = false; | |
549 | } | |
550 | ||
11fdf7f2 | 551 | key_.UpdateInternalKey(global_seqno_, value_type); |
7c673cae FG |
552 | } |
553 | ||
554 | value_ = Slice(p + non_shared, value_length); | |
11fdf7f2 TL |
555 | if (shared == 0) { |
556 | while (restart_index_ + 1 < num_restarts_ && | |
557 | GetRestartPoint(restart_index_ + 1) < current_) { | |
558 | ++restart_index_; | |
559 | } | |
560 | } | |
561 | // else we are in the middle of a restart interval and the restart_index_ | |
562 | // thus has not changed | |
563 | return true; | |
564 | } | |
565 | } | |
566 | ||
// Decode the index entry at NextEntryOffset() and make it current. Returns
// false when there are no more entries or the entry is corrupt. For
// format_version 4 blocks (value_delta_encoded_), the BlockHandle value is
// decoded separately by DecodeCurrentValue().
bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    // v4: no value length stored before the entry.
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    key_.SetKey(Slice(p, non_shared), false /* copy */);
    key_pinned_ = true;
  } else {
    // This key shares `shared` bytes with prev key; we need to decode it.
    key_.TrimAppend(shared, p, non_shared);
    key_pinned_ = false;
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    // Start of a new restart interval; advance restart_index_ to match.
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_) {
    assert(value_length == 0);
    DecodeCurrentValue(shared);
  }
  return true;
}
615 | ||
// The format:
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parenthesis.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  assert(value_delta_encoded_);
  const char* limit = data_ + restarts_;
  if (shared == 0) {
    // Restart entry: the full (offset, size) handle is stored.
    uint64_t o, s;
    const char* newp = GetVarint64Ptr(value_.data(), limit, &o);
    assert(newp);
    newp = GetVarint64Ptr(newp, limit, &s);
    assert(newp);
    decoded_value_ = BlockHandle(o, s);
    value_ = Slice(value_.data(), newp - value_.data());
  } else {
    // Delta entry: this handle's offset is implied by the previous handle
    // (previous block plus its trailer); only a signed size delta is stored.
    uint64_t next_value_base =
        decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize;
    int64_t delta;
    const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta);
    decoded_value_ =
        BlockHandle(next_value_base, decoded_value_.size() + delta);
    value_ = Slice(value_.data(), newp - value_.data());
  }
}
648 | ||
// Binary search in restart array to find the first restart point that
// is either the last restart point with a key less than target,
// which means the key of next restart point is larger than target, or
// the first restart point with a key = target
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
                                   uint32_t right, uint32_t* index,
                                   const Comparator* comp) {
  assert(left <= right);

  while (left < right) {
    // Round up so the loop makes progress when right == left + 1.
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      // A restart-point entry must store a full key (shared == 0).
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = comp->Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      // Exact match.
      left = right = mid;
    }
  }

  *index = left;
  return true;
}
688 | ||
// Compare target key and the block key of the block of `block_index`.
// On decode failure, sets status_ via CorruptionError() and returns 1
// (i.e. treats the target as smaller); callers must check status_.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    // Restart-point entries must store a full key (shared == 0).
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}
707 | ||
// Binary search in block_ids to find the first block
// with a key >= target. block_ids is the (sorted) candidate list produced
// by the prefix index; on success *index receives the chosen block id.
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      // CompareBlockKey hit corruption; propagate failure.
      return false;
    }
    if (cmp < 0) {
      // Key at "target" is larger than "mid". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "target" is <= "mid". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      // Target sorts entirely before the found block's range: not present.
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}
759 | ||
11fdf7f2 | 760 | bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { |
7c673cae | 761 | assert(prefix_index_); |
11fdf7f2 TL |
762 | Slice seek_key = target; |
763 | if (!key_includes_seq_) { | |
764 | seek_key = ExtractUserKey(target); | |
765 | } | |
7c673cae FG |
766 | uint32_t* block_ids = nullptr; |
767 | uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); | |
768 | ||
769 | if (num_blocks == 0) { | |
770 | current_ = restarts_; | |
771 | return false; | |
494da23a | 772 | } else { |
11fdf7f2 | 773 | return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); |
7c673cae FG |
774 | } |
775 | } | |
776 | ||
777 | uint32_t Block::NumRestarts() const { | |
494da23a | 778 | assert(size_ >= 2 * sizeof(uint32_t)); |
11fdf7f2 TL |
779 | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
780 | uint32_t num_restarts = block_footer; | |
781 | if (size_ > kMaxBlockSizeSupportedByHashIndex) { | |
782 | // In BlockBuilder, we have ensured a block with HashIndex is less than | |
783 | // kMaxBlockSizeSupportedByHashIndex (64KiB). | |
784 | // | |
785 | // Therefore, if we encounter a block with a size > 64KiB, the block | |
786 | // cannot have HashIndex. So the footer will directly interpreted as | |
787 | // num_restarts. | |
788 | // | |
789 | // Such check is for backward compatibility. We can ensure legacy block | |
790 | // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted | |
791 | // correctly as no HashIndex even if the MSB of num_restarts is set. | |
792 | return num_restarts; | |
793 | } | |
794 | BlockBasedTableOptions::DataBlockIndexType index_type; | |
795 | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); | |
796 | return num_restarts; | |
797 | } | |
798 | ||
799 | BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { | |
800 | assert(size_ >= 2 * sizeof(uint32_t)); | |
801 | if (size_ > kMaxBlockSizeSupportedByHashIndex) { | |
802 | // The check is for the same reason as that in NumRestarts() | |
803 | return BlockBasedTableOptions::kDataBlockBinarySearch; | |
804 | } | |
805 | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); | |
806 | uint32_t num_restarts = block_footer; | |
807 | BlockBasedTableOptions::DataBlockIndexType index_type; | |
808 | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); | |
809 | return index_type; | |
810 | } | |
811 | ||
812 | Block::~Block() { | |
813 | // This sync point can be re-enabled if RocksDB can control the | |
814 | // initialization order of any/all static options created by the user. | |
815 | // TEST_SYNC_POINT("Block::~Block"); | |
7c673cae FG |
816 | } |
817 | ||
818 | Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, | |
819 | size_t read_amp_bytes_per_bit, Statistics* statistics) | |
820 | : contents_(std::move(contents)), | |
821 | data_(contents_.data.data()), | |
822 | size_(contents_.data.size()), | |
11fdf7f2 TL |
823 | restart_offset_(0), |
824 | num_restarts_(0), | |
7c673cae | 825 | global_seqno_(_global_seqno) { |
11fdf7f2 | 826 | TEST_SYNC_POINT("Block::Block:0"); |
7c673cae FG |
827 | if (size_ < sizeof(uint32_t)) { |
828 | size_ = 0; // Error marker | |
829 | } else { | |
11fdf7f2 | 830 | // Should only decode restart points for uncompressed blocks |
494da23a TL |
831 | num_restarts_ = NumRestarts(); |
832 | switch (IndexType()) { | |
833 | case BlockBasedTableOptions::kDataBlockBinarySearch: | |
834 | restart_offset_ = static_cast<uint32_t>(size_) - | |
835 | (1 + num_restarts_) * sizeof(uint32_t); | |
836 | if (restart_offset_ > size_ - sizeof(uint32_t)) { | |
837 | // The size is too small for NumRestarts() and therefore | |
838 | // restart_offset_ wrapped around. | |
839 | size_ = 0; | |
840 | } | |
841 | break; | |
842 | case BlockBasedTableOptions::kDataBlockBinaryAndHash: | |
843 | if (size_ < sizeof(uint32_t) /* block footer */ + | |
844 | sizeof(uint16_t) /* NUM_BUCK */) { | |
845 | size_ = 0; | |
11fdf7f2 | 846 | break; |
494da23a TL |
847 | } |
848 | ||
849 | uint16_t map_offset; | |
850 | data_block_hash_index_.Initialize( | |
851 | contents.data.data(), | |
852 | static_cast<uint16_t>(contents.data.size() - | |
853 | sizeof(uint32_t)), /*chop off | |
854 | NUM_RESTARTS*/ | |
855 | &map_offset); | |
856 | ||
857 | restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); | |
858 | ||
859 | if (restart_offset_ > map_offset) { | |
860 | // map_offset is too small for NumRestarts() and | |
861 | // therefore restart_offset_ wrapped around. | |
862 | size_ = 0; | |
11fdf7f2 | 863 | break; |
494da23a TL |
864 | } |
865 | break; | |
866 | default: | |
867 | size_ = 0; // Error marker | |
7c673cae FG |
868 | } |
869 | } | |
870 | if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { | |
871 | read_amp_bitmap_.reset(new BlockReadAmpBitmap( | |
872 | restart_offset_, read_amp_bytes_per_bit, statistics)); | |
873 | } | |
874 | } | |
875 | ||
11fdf7f2 TL |
876 | template <> |
877 | DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, | |
878 | DataBlockIter* iter, Statistics* stats, | |
879 | bool /*total_order_seek*/, | |
880 | bool /*key_includes_seq*/, | |
881 | bool /*value_is_full*/, | |
494da23a | 882 | bool block_contents_pinned, |
11fdf7f2 TL |
883 | BlockPrefixIndex* /*prefix_index*/) { |
884 | DataBlockIter* ret_iter; | |
885 | if (iter != nullptr) { | |
886 | ret_iter = iter; | |
887 | } else { | |
888 | ret_iter = new DataBlockIter; | |
7c673cae | 889 | } |
11fdf7f2 TL |
890 | if (size_ < 2 * sizeof(uint32_t)) { |
891 | ret_iter->Invalidate(Status::Corruption("bad block contents")); | |
892 | return ret_iter; | |
893 | } | |
894 | if (num_restarts_ == 0) { | |
895 | // Empty block. | |
896 | ret_iter->Invalidate(Status::OK()); | |
897 | return ret_iter; | |
7c673cae | 898 | } else { |
11fdf7f2 TL |
899 | ret_iter->Initialize( |
900 | cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, | |
494da23a | 901 | read_amp_bitmap_.get(), block_contents_pinned, |
11fdf7f2 | 902 | data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); |
7c673cae FG |
903 | if (read_amp_bitmap_) { |
904 | if (read_amp_bitmap_->GetStatistics() != stats) { | |
905 | // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ | |
906 | read_amp_bitmap_->SetStatistics(stats); | |
907 | } | |
908 | } | |
909 | } | |
910 | ||
11fdf7f2 | 911 | return ret_iter; |
7c673cae FG |
912 | } |
913 | ||
11fdf7f2 TL |
914 | template <> |
915 | IndexBlockIter* Block::NewIterator(const Comparator* cmp, | |
916 | const Comparator* ucmp, IndexBlockIter* iter, | |
917 | Statistics* /*stats*/, bool total_order_seek, | |
918 | bool key_includes_seq, bool value_is_full, | |
494da23a | 919 | bool block_contents_pinned, |
11fdf7f2 TL |
920 | BlockPrefixIndex* prefix_index) { |
921 | IndexBlockIter* ret_iter; | |
922 | if (iter != nullptr) { | |
923 | ret_iter = iter; | |
924 | } else { | |
925 | ret_iter = new IndexBlockIter; | |
926 | } | |
927 | if (size_ < 2 * sizeof(uint32_t)) { | |
928 | ret_iter->Invalidate(Status::Corruption("bad block contents")); | |
929 | return ret_iter; | |
930 | } | |
931 | if (num_restarts_ == 0) { | |
932 | // Empty block. | |
933 | ret_iter->Invalidate(Status::OK()); | |
934 | return ret_iter; | |
935 | } else { | |
936 | BlockPrefixIndex* prefix_index_ptr = | |
937 | total_order_seek ? nullptr : prefix_index; | |
938 | ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, | |
939 | prefix_index_ptr, key_includes_seq, value_is_full, | |
494da23a TL |
940 | block_contents_pinned, |
941 | nullptr /* data_block_hash_index */); | |
11fdf7f2 TL |
942 | } |
943 | ||
944 | return ret_iter; | |
7c673cae FG |
945 | } |
946 | ||
947 | size_t Block::ApproximateMemoryUsage() const { | |
948 | size_t usage = usable_size(); | |
11fdf7f2 TL |
949 | #ifdef ROCKSDB_MALLOC_USABLE_SIZE |
950 | usage += malloc_usable_size((void*)this); | |
951 | #else | |
952 | usage += sizeof(*this); | |
953 | #endif // ROCKSDB_MALLOC_USABLE_SIZE | |
954 | if (read_amp_bitmap_) { | |
955 | usage += read_amp_bitmap_->ApproximateMemoryUsage(); | |
7c673cae FG |
956 | } |
957 | return usage; | |
958 | } | |
959 | ||
960 | } // namespace rocksdb |