// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.
f67539c2 | 12 | #include "table/block_based/block.h" |
7c673cae FG |
13 | #include <algorithm> |
14 | #include <string> | |
15 | #include <unordered_map> | |
16 | #include <vector> | |
17 | ||
18 | #include "monitoring/perf_context_imp.h" | |
19 | #include "port/port.h" | |
20 | #include "port/stack_trace.h" | |
21 | #include "rocksdb/comparator.h" | |
f67539c2 TL |
22 | #include "table/block_based/block_prefix_index.h" |
23 | #include "table/block_based/data_block_footer.h" | |
7c673cae FG |
24 | #include "table/format.h" |
25 | #include "util/coding.h" | |
7c673cae | 26 | |
f67539c2 | 27 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
28 | |
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      // Slow path: at least one of the three lengths needs more than one
      // varint byte, so re-decode all of them as full varint32s.
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};
7c673cae | 64 | |
494da23a TL |
65 | // Helper routine: similar to DecodeEntry but does not have assertions. |
66 | // Instead, returns nullptr so that caller can detect and report failure. | |
67 | struct CheckAndDecodeEntry { | |
68 | inline const char* operator()(const char* p, const char* limit, | |
69 | uint32_t* shared, uint32_t* non_shared, | |
70 | uint32_t* value_length) { | |
71 | // We need 2 bytes for shared and non_shared size. We also need one more | |
72 | // byte either for value size or the actual value in case of value delta | |
73 | // encoding. | |
74 | if (limit - p < 3) { | |
75 | return nullptr; | |
76 | } | |
77 | *shared = reinterpret_cast<const unsigned char*>(p)[0]; | |
78 | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; | |
79 | *value_length = reinterpret_cast<const unsigned char*>(p)[2]; | |
80 | if ((*shared | *non_shared | *value_length) < 128) { | |
81 | // Fast path: all three values are encoded in one byte each | |
82 | p += 3; | |
83 | } else { | |
84 | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; | |
85 | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; | |
86 | if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { | |
87 | return nullptr; | |
88 | } | |
89 | } | |
90 | ||
91 | if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { | |
92 | return nullptr; | |
93 | } | |
94 | return p; | |
95 | } | |
96 | }; | |
97 | ||
11fdf7f2 TL |
98 | struct DecodeKey { |
99 | inline const char* operator()(const char* p, const char* limit, | |
100 | uint32_t* shared, uint32_t* non_shared) { | |
101 | uint32_t value_length; | |
102 | return DecodeEntry()(p, limit, shared, non_shared, &value_length); | |
7c673cae | 103 | } |
11fdf7f2 TL |
104 | }; |
105 | ||
// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each (note: unlike
      // DecodeEntry, only two sizes are stored in this format).
      p += 2;
    } else {
      // Slow path: decode both sizes as full varint32s.
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};
128 | ||
20effc67 | 129 | void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); } |
494da23a | 130 | |
20effc67 | 131 | void DataBlockIter::NextOrReportImpl() { |
494da23a | 132 | ParseNextDataKey<CheckAndDecodeEntry>(); |
7c673cae FG |
133 | } |
134 | ||
20effc67 | 135 | void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } |
7c673cae | 136 | |
// Step the index iterator back to the entry preceding the current one.
// Entries are delta-encoded and can only be parsed forward, so we back up
// to the nearest restart point before `current_` and re-parse forward until
// we land on the entry just before the original position.
void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries: mark the iterator invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}
155 | ||
20effc67 TL |
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
// seen while re-parsing forward from a restart point, so that repeated
// backward iteration can be served from the cache instead of re-decoding.
void DataBlockIter::PrevImpl() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
    // not necessity. It is convenient since this class treats keys as pinned
    // when `raw_key_` points to an outside buffer. So we cannot allow
    // `raw_key_` point into Prev cache as it is a transient outside buffer
    // (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Cache miss (or cache exhausted): clear prev entries cache and rebuild it
  // by scanning forward from the preceding restart point.
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries: mark the iterator invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = raw_key_.GetKey();

    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded; remember its address in the block.
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
236 | ||
20effc67 | 237 | void DataBlockIter::SeekImpl(const Slice& target) { |
11fdf7f2 TL |
238 | Slice seek_key = target; |
239 | PERF_TIMER_GUARD(block_seek_nanos); | |
240 | if (data_ == nullptr) { // Not init yet | |
241 | return; | |
242 | } | |
243 | uint32_t index = 0; | |
20effc67 TL |
244 | bool skip_linear_scan = false; |
245 | bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); | |
11fdf7f2 TL |
246 | |
247 | if (!ok) { | |
248 | return; | |
249 | } | |
20effc67 | 250 | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
11fdf7f2 TL |
251 | } |
252 | ||
// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically as Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key from seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_key is less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  // The hash index map is laid out immediately after the restart array.
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back to a regular binary-search Seek.
    SeekImpl(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:  [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or to the end of the block if no one is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  // Parsing is bounded by the start of the next restart interval (or the end
  // of the data region for the last interval).
  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seeking a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) ||
        CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reaches to the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    SeekImpl(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}
371 | ||
// Position the index iterator at the first entry whose key is >= `target`.
// When a prefix index is present it is consulted first; otherwise a binary
// search over restart points is used (with the V4 key decoder when values
// are delta encoded).
void IndexBlockIter::SeekImpl(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  // Index keys may be stored as user keys; strip the internal-key footer in
  // that case so comparisons are apples-to-apples.
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This lets the caller distinguish between a non-existing prefix and
      // a key larger than the last key, both of which set Valid() to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // restart interval must be one when hash search is enabled so the binary
    // search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}
410 | ||
// Position the iterator at the last entry whose key is <= `target`.
// Implemented as a forward Seek followed by stepping backwards while the
// current key is still greater than the target.
void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);

  if (!Valid()) {
    // All keys are <= target: the answer is the last entry of the block.
    SeekToLastImpl();
  } else {
    // Landed on the first key >= target; walk back to the last key <= target.
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}
434 | ||
20effc67 | 435 | void DataBlockIter::SeekToFirstImpl() { |
7c673cae FG |
436 | if (data_ == nullptr) { // Not init yet |
437 | return; | |
438 | } | |
439 | SeekToRestartPoint(0); | |
494da23a TL |
440 | ParseNextDataKey<DecodeEntry>(); |
441 | } | |
442 | ||
20effc67 | 443 | void DataBlockIter::SeekToFirstOrReportImpl() { |
494da23a TL |
444 | if (data_ == nullptr) { // Not init yet |
445 | return; | |
446 | } | |
447 | SeekToRestartPoint(0); | |
448 | ParseNextDataKey<CheckAndDecodeEntry>(); | |
7c673cae FG |
449 | } |
450 | ||
20effc67 | 451 | void IndexBlockIter::SeekToFirstImpl() { |
11fdf7f2 TL |
452 | if (data_ == nullptr) { // Not init yet |
453 | return; | |
454 | } | |
f67539c2 | 455 | status_ = Status::OK(); |
11fdf7f2 TL |
456 | SeekToRestartPoint(0); |
457 | ParseNextIndexKey(); | |
458 | } | |
459 | ||
20effc67 | 460 | void DataBlockIter::SeekToLastImpl() { |
7c673cae FG |
461 | if (data_ == nullptr) { // Not init yet |
462 | return; | |
463 | } | |
464 | SeekToRestartPoint(num_restarts_ - 1); | |
494da23a | 465 | while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) { |
7c673cae FG |
466 | // Keep skipping |
467 | } | |
468 | } | |
469 | ||
20effc67 | 470 | void IndexBlockIter::SeekToLastImpl() { |
11fdf7f2 TL |
471 | if (data_ == nullptr) { // Not init yet |
472 | return; | |
473 | } | |
f67539c2 | 474 | status_ = Status::OK(); |
11fdf7f2 TL |
475 | SeekToRestartPoint(num_restarts_ - 1); |
476 | while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { | |
477 | // Keep skipping | |
478 | } | |
479 | } | |
480 | ||
481 | template <class TValue> | |
482 | void BlockIter<TValue>::CorruptionError() { | |
7c673cae FG |
483 | current_ = restarts_; |
484 | restart_index_ = num_restarts_; | |
485 | status_ = Status::Corruption("bad entry in block"); | |
20effc67 | 486 | raw_key_.Clear(); |
7c673cae FG |
487 | value_.clear(); |
488 | } | |
489 | ||
// Decode the entry at NextEntryOffset() into raw_key_/value_ using the given
// decoder functor. Parsing stops at `limit` when provided, otherwise at the
// start of the restart array. Returns false when there are no more entries
// (iterator marked invalid) or when the entry is corrupt (CorruptionError()
// sets the status).
template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    // Decode failed, or the previous key is too short to share `shared`
    // bytes with this one: the block is corrupt.
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key shares `shared` bytes with prev key, we need to decode it
      raw_key_.TrimAppend(shared, p, non_shared);
    }

#ifndef NDEBUG
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);
      assert(seqno == 0);
    }
#endif  // NDEBUG

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      // A restart-point entry: advance restart_index_ to the interval that
      // contains current_.
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}
550 | ||
// Decode the index entry at NextEntryOffset() into raw_key_/value_.
// Returns false when there are no more entries (iterator marked invalid) or
// on corruption (CorruptionError() sets the status).
bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry. In format_version 4 (value_delta_encoded_) the value
  // length is not stored with the key, so use the V4 key decoder.
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || raw_key_.Size() < shared) {
    // Decode failed, or the previous key is too short to share `shared`
    // bytes with this one: the block is corrupt.
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
  } else {
    // This key shares `shared` bytes with prev key, we need to decode it
    raw_key_.TrimAppend(shared, p, non_shared);
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    // A restart-point entry: advance restart_index_ to the interval that
    // contains current_.
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}
596 | ||
// The format:
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parenthesis.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  // `v` spans from the start of the current value to the end of the data
  // region; DecodeFrom advances it past what it consumes.
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  // Trim value_ to exactly the bytes DecodeFrom consumed.
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}
637 | ||
20effc67 TL |
// After BinarySeek has chosen restart interval `index`, position the
// iterator at the first key >= `target`. When `skip_linear_scan` is set the
// restart key itself is already known to be the answer.
template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  NextImpl();

  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = port::kMaxUint32;
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        break;
      }
      if (current_ == max_offset) {
        // Reached the next restart key without comparing: it is known to be
        // strictly greater than `target`.
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}
675 | ||
// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }

  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      // A restart key must never be delta encoded (shared == 0); anything
      // else indicates corruption.
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    raw_key_.SetKey(mid_key, false /* copy */);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is >= "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      // Exact match: no linear scan needed.
      *skip_linear_scan = true;
      left = right = mid;
    }
  }

  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}
744 | ||
745 | // Compare target key and the block key of the block of `block_index`. | |
746 | // Return -1 if error. | |
11fdf7f2 | 747 | int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { |
7c673cae | 748 | uint32_t region_offset = GetRestartPoint(block_index); |
11fdf7f2 TL |
749 | uint32_t shared, non_shared; |
750 | const char* key_ptr = | |
751 | value_delta_encoded_ | |
752 | ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, | |
753 | &non_shared) | |
754 | : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, | |
755 | &non_shared); | |
7c673cae FG |
756 | if (key_ptr == nullptr || (shared != 0)) { |
757 | CorruptionError(); | |
758 | return 1; // Return target is smaller | |
759 | } | |
760 | Slice block_key(key_ptr, non_shared); | |
20effc67 TL |
761 | raw_key_.SetKey(block_key, false /* copy */); |
762 | return CompareCurrentKey(target); | |
7c673cae FG |
763 | } |
764 | ||
765 | // Binary search in block_ids to find the first block | |
766 | // with a key >= target | |
11fdf7f2 TL |
767 | bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, |
768 | uint32_t* block_ids, uint32_t left, | |
f67539c2 TL |
769 | uint32_t right, uint32_t* index, |
770 | bool* prefix_may_exist) { | |
7c673cae | 771 | assert(left <= right); |
f67539c2 TL |
772 | assert(index); |
773 | assert(prefix_may_exist); | |
774 | *prefix_may_exist = true; | |
7c673cae FG |
775 | uint32_t left_bound = left; |
776 | ||
777 | while (left <= right) { | |
778 | uint32_t mid = (right + left) / 2; | |
779 | ||
780 | int cmp = CompareBlockKey(block_ids[mid], target); | |
781 | if (!status_.ok()) { | |
782 | return false; | |
783 | } | |
784 | if (cmp < 0) { | |
785 | // Key at "target" is larger than "mid". Therefore all | |
786 | // blocks before or at "mid" are uninteresting. | |
787 | left = mid + 1; | |
788 | } else { | |
789 | // Key at "target" is <= "mid". Therefore all blocks | |
790 | // after "mid" are uninteresting. | |
791 | // If there is only one block left, we found it. | |
792 | if (left == right) break; | |
793 | right = mid; | |
794 | } | |
795 | } | |
796 | ||
797 | if (left == right) { | |
798 | // In one of the two following cases: | |
799 | // (1) left is the first one of block_ids | |
800 | // (2) there is a gap of blocks between block of `left` and `left-1`. | |
801 | // we can further distinguish the case of key in the block or key not | |
802 | // existing, by comparing the target key and the key of the previous | |
803 | // block to the left of the block found. | |
804 | if (block_ids[left] > 0 && | |
805 | (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && | |
806 | CompareBlockKey(block_ids[left] - 1, target) > 0) { | |
807 | current_ = restarts_; | |
f67539c2 | 808 | *prefix_may_exist = false; |
7c673cae FG |
809 | return false; |
810 | } | |
811 | ||
812 | *index = block_ids[left]; | |
813 | return true; | |
814 | } else { | |
815 | assert(left > right); | |
f67539c2 TL |
816 | |
817 | // If the next block key is larger than seek key, it is possible that | |
818 | // no key shares the prefix with `target`, or all keys with the same | |
819 | // prefix as `target` are smaller than prefix. In the latter case, | |
820 | // we are mandated to set the position the same as the total order. | |
821 | // In the latter case, either: | |
822 | // (1) `target` falls into the range of the next block. In this case, | |
823 | // we can place the iterator to the next block, or | |
824 | // (2) `target` is larger than all block keys. In this case we can | |
825 | // keep the iterator invalidate without setting `prefix_may_exist` | |
826 | // to false. | |
827 | // We might sometimes end up with setting the total order position | |
828 | // while there is no key sharing the prefix as `target`, but it | |
829 | // still follows the contract. | |
830 | uint32_t right_index = block_ids[right]; | |
831 | assert(right_index + 1 <= num_restarts_); | |
832 | if (right_index + 1 < num_restarts_) { | |
833 | if (CompareBlockKey(right_index + 1, target) >= 0) { | |
834 | *index = right_index + 1; | |
835 | return true; | |
836 | } else { | |
837 | // We have to set the flag here because we are not positioning | |
838 | // the iterator to the total order position. | |
839 | *prefix_may_exist = false; | |
840 | } | |
841 | } | |
842 | ||
7c673cae FG |
843 | // Mark iterator invalid |
844 | current_ = restarts_; | |
845 | return false; | |
846 | } | |
847 | } | |
848 | ||
f67539c2 TL |
849 | bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, |
850 | bool* prefix_may_exist) { | |
851 | assert(index); | |
852 | assert(prefix_may_exist); | |
7c673cae | 853 | assert(prefix_index_); |
f67539c2 | 854 | *prefix_may_exist = true; |
11fdf7f2 | 855 | Slice seek_key = target; |
20effc67 | 856 | if (raw_key_.IsUserKey()) { |
11fdf7f2 TL |
857 | seek_key = ExtractUserKey(target); |
858 | } | |
7c673cae FG |
859 | uint32_t* block_ids = nullptr; |
860 | uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); | |
861 | ||
862 | if (num_blocks == 0) { | |
863 | current_ = restarts_; | |
f67539c2 | 864 | *prefix_may_exist = false; |
7c673cae | 865 | return false; |
494da23a | 866 | } else { |
f67539c2 TL |
867 | assert(block_ids); |
868 | return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, | |
869 | prefix_may_exist); | |
7c673cae FG |
870 | } |
871 | } | |
872 | ||
873 | uint32_t Block::NumRestarts() const { | |
494da23a | 874 | assert(size_ >= 2 * sizeof(uint32_t)); |
11fdf7f2 TL |
875 | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
876 | uint32_t num_restarts = block_footer; | |
877 | if (size_ > kMaxBlockSizeSupportedByHashIndex) { | |
878 | // In BlockBuilder, we have ensured a block with HashIndex is less than | |
879 | // kMaxBlockSizeSupportedByHashIndex (64KiB). | |
880 | // | |
881 | // Therefore, if we encounter a block with a size > 64KiB, the block | |
882 | // cannot have HashIndex. So the footer will directly interpreted as | |
883 | // num_restarts. | |
884 | // | |
885 | // Such check is for backward compatibility. We can ensure legacy block | |
886 | // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted | |
887 | // correctly as no HashIndex even if the MSB of num_restarts is set. | |
888 | return num_restarts; | |
889 | } | |
890 | BlockBasedTableOptions::DataBlockIndexType index_type; | |
891 | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); | |
892 | return num_restarts; | |
893 | } | |
894 | ||
895 | BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { | |
896 | assert(size_ >= 2 * sizeof(uint32_t)); | |
897 | if (size_ > kMaxBlockSizeSupportedByHashIndex) { | |
898 | // The check is for the same reason as that in NumRestarts() | |
899 | return BlockBasedTableOptions::kDataBlockBinarySearch; | |
900 | } | |
901 | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); | |
902 | uint32_t num_restarts = block_footer; | |
903 | BlockBasedTableOptions::DataBlockIndexType index_type; | |
904 | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); | |
905 | return index_type; | |
906 | } | |
907 | ||
908 | Block::~Block() { | |
909 | // This sync point can be re-enabled if RocksDB can control the | |
910 | // initialization order of any/all static options created by the user. | |
911 | // TEST_SYNC_POINT("Block::~Block"); | |
7c673cae FG |
912 | } |
913 | ||
20effc67 TL |
914 | Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, |
915 | Statistics* statistics) | |
7c673cae FG |
916 | : contents_(std::move(contents)), |
917 | data_(contents_.data.data()), | |
918 | size_(contents_.data.size()), | |
11fdf7f2 | 919 | restart_offset_(0), |
20effc67 | 920 | num_restarts_(0) { |
11fdf7f2 | 921 | TEST_SYNC_POINT("Block::Block:0"); |
7c673cae FG |
922 | if (size_ < sizeof(uint32_t)) { |
923 | size_ = 0; // Error marker | |
924 | } else { | |
11fdf7f2 | 925 | // Should only decode restart points for uncompressed blocks |
494da23a TL |
926 | num_restarts_ = NumRestarts(); |
927 | switch (IndexType()) { | |
928 | case BlockBasedTableOptions::kDataBlockBinarySearch: | |
929 | restart_offset_ = static_cast<uint32_t>(size_) - | |
930 | (1 + num_restarts_) * sizeof(uint32_t); | |
931 | if (restart_offset_ > size_ - sizeof(uint32_t)) { | |
932 | // The size is too small for NumRestarts() and therefore | |
933 | // restart_offset_ wrapped around. | |
934 | size_ = 0; | |
935 | } | |
936 | break; | |
937 | case BlockBasedTableOptions::kDataBlockBinaryAndHash: | |
938 | if (size_ < sizeof(uint32_t) /* block footer */ + | |
939 | sizeof(uint16_t) /* NUM_BUCK */) { | |
940 | size_ = 0; | |
11fdf7f2 | 941 | break; |
494da23a TL |
942 | } |
943 | ||
944 | uint16_t map_offset; | |
945 | data_block_hash_index_.Initialize( | |
946 | contents.data.data(), | |
947 | static_cast<uint16_t>(contents.data.size() - | |
948 | sizeof(uint32_t)), /*chop off | |
949 | NUM_RESTARTS*/ | |
950 | &map_offset); | |
951 | ||
952 | restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); | |
953 | ||
954 | if (restart_offset_ > map_offset) { | |
955 | // map_offset is too small for NumRestarts() and | |
956 | // therefore restart_offset_ wrapped around. | |
957 | size_ = 0; | |
11fdf7f2 | 958 | break; |
494da23a TL |
959 | } |
960 | break; | |
961 | default: | |
962 | size_ = 0; // Error marker | |
7c673cae FG |
963 | } |
964 | } | |
965 | if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { | |
966 | read_amp_bitmap_.reset(new BlockReadAmpBitmap( | |
967 | restart_offset_, read_amp_bytes_per_bit, statistics)); | |
968 | } | |
969 | } | |
970 | ||
20effc67 TL |
971 | DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, |
972 | SequenceNumber global_seqno, | |
f67539c2 TL |
973 | DataBlockIter* iter, Statistics* stats, |
974 | bool block_contents_pinned) { | |
11fdf7f2 TL |
975 | DataBlockIter* ret_iter; |
976 | if (iter != nullptr) { | |
977 | ret_iter = iter; | |
978 | } else { | |
979 | ret_iter = new DataBlockIter; | |
7c673cae | 980 | } |
11fdf7f2 TL |
981 | if (size_ < 2 * sizeof(uint32_t)) { |
982 | ret_iter->Invalidate(Status::Corruption("bad block contents")); | |
983 | return ret_iter; | |
984 | } | |
985 | if (num_restarts_ == 0) { | |
986 | // Empty block. | |
987 | ret_iter->Invalidate(Status::OK()); | |
988 | return ret_iter; | |
7c673cae | 989 | } else { |
11fdf7f2 | 990 | ret_iter->Initialize( |
20effc67 | 991 | raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, |
494da23a | 992 | read_amp_bitmap_.get(), block_contents_pinned, |
11fdf7f2 | 993 | data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); |
7c673cae FG |
994 | if (read_amp_bitmap_) { |
995 | if (read_amp_bitmap_->GetStatistics() != stats) { | |
996 | // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ | |
997 | read_amp_bitmap_->SetStatistics(stats); | |
998 | } | |
999 | } | |
1000 | } | |
1001 | ||
11fdf7f2 | 1002 | return ret_iter; |
7c673cae FG |
1003 | } |
1004 | ||
f67539c2 | 1005 | IndexBlockIter* Block::NewIndexIterator( |
20effc67 TL |
1006 | const Comparator* raw_ucmp, SequenceNumber global_seqno, |
1007 | IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, | |
1008 | bool have_first_key, bool key_includes_seq, bool value_is_full, | |
1009 | bool block_contents_pinned, BlockPrefixIndex* prefix_index) { | |
11fdf7f2 TL |
1010 | IndexBlockIter* ret_iter; |
1011 | if (iter != nullptr) { | |
1012 | ret_iter = iter; | |
1013 | } else { | |
1014 | ret_iter = new IndexBlockIter; | |
1015 | } | |
1016 | if (size_ < 2 * sizeof(uint32_t)) { | |
1017 | ret_iter->Invalidate(Status::Corruption("bad block contents")); | |
1018 | return ret_iter; | |
1019 | } | |
1020 | if (num_restarts_ == 0) { | |
1021 | // Empty block. | |
1022 | ret_iter->Invalidate(Status::OK()); | |
1023 | return ret_iter; | |
1024 | } else { | |
1025 | BlockPrefixIndex* prefix_index_ptr = | |
1026 | total_order_seek ? nullptr : prefix_index; | |
20effc67 TL |
1027 | ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, |
1028 | global_seqno, prefix_index_ptr, have_first_key, | |
f67539c2 TL |
1029 | key_includes_seq, value_is_full, |
1030 | block_contents_pinned); | |
11fdf7f2 TL |
1031 | } |
1032 | ||
1033 | return ret_iter; | |
7c673cae FG |
1034 | } |
1035 | ||
1036 | size_t Block::ApproximateMemoryUsage() const { | |
1037 | size_t usage = usable_size(); | |
11fdf7f2 TL |
1038 | #ifdef ROCKSDB_MALLOC_USABLE_SIZE |
1039 | usage += malloc_usable_size((void*)this); | |
1040 | #else | |
1041 | usage += sizeof(*this); | |
1042 | #endif // ROCKSDB_MALLOC_USABLE_SIZE | |
1043 | if (read_amp_bitmap_) { | |
1044 | usage += read_amp_bitmap_->ApproximateMemoryUsage(); | |
7c673cae FG |
1045 | } |
1046 | return usage; | |
1047 | } | |
1048 | ||
f67539c2 | 1049 | } // namespace ROCKSDB_NAMESPACE |