]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be | |
3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
4 | ||
5 | #ifndef ROCKSDB_LITE | |
6 | ||
7 | #include "table/plain_table_reader.h" | |
8 | ||
9 | #include <string> | |
10 | #include <vector> | |
11 | ||
12 | #include "db/dbformat.h" | |
13 | ||
14 | #include "rocksdb/cache.h" | |
15 | #include "rocksdb/comparator.h" | |
16 | #include "rocksdb/env.h" | |
17 | #include "rocksdb/filter_policy.h" | |
18 | #include "rocksdb/options.h" | |
19 | #include "rocksdb/statistics.h" | |
20 | ||
21 | #include "table/block.h" | |
22 | #include "table/bloom_block.h" | |
23 | #include "table/filter_block.h" | |
24 | #include "table/format.h" | |
25 | #include "table/internal_iterator.h" | |
26 | #include "table/meta_blocks.h" | |
27 | #include "table/two_level_iterator.h" | |
28 | #include "table/plain_table_factory.h" | |
29 | #include "table/plain_table_key_coding.h" | |
30 | #include "table/get_context.h" | |
31 | ||
32 | #include "monitoring/histogram.h" | |
33 | #include "monitoring/perf_context_imp.h" | |
34 | #include "util/arena.h" | |
35 | #include "util/coding.h" | |
36 | #include "util/dynamic_bloom.h" | |
37 | #include "util/hash.h" | |
38 | #include "util/murmurhash.h" | |
39 | #include "util/stop_watch.h" | |
40 | #include "util/string_util.h" | |
41 | ||
42 | namespace rocksdb { | |
43 | ||
44 | namespace { | |
45 | ||
46 | // Safely getting a uint32_t element from a char array, where, starting from | |
47 | // `base`, every 4 bytes are considered as an fixed 32 bit integer. | |
48 | inline uint32_t GetFixed32Element(const char* base, size_t offset) { | |
49 | return DecodeFixed32(base + offset * sizeof(uint32_t)); | |
50 | } | |
51 | } // namespace | |
52 | ||
53 | // Iterator to iterate IndexedTable | |
54 | class PlainTableIterator : public InternalIterator { | |
55 | public: | |
56 | explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); | |
57 | ~PlainTableIterator(); | |
58 | ||
59 | bool Valid() const override; | |
60 | ||
61 | void SeekToFirst() override; | |
62 | ||
63 | void SeekToLast() override; | |
64 | ||
65 | void Seek(const Slice& target) override; | |
66 | ||
67 | void SeekForPrev(const Slice& target) override; | |
68 | ||
69 | void Next() override; | |
70 | ||
71 | void Prev() override; | |
72 | ||
73 | Slice key() const override; | |
74 | ||
75 | Slice value() const override; | |
76 | ||
77 | Status status() const override; | |
78 | ||
79 | private: | |
80 | PlainTableReader* table_; | |
81 | PlainTableKeyDecoder decoder_; | |
82 | bool use_prefix_seek_; | |
83 | uint32_t offset_; | |
84 | uint32_t next_offset_; | |
85 | Slice key_; | |
86 | Slice value_; | |
87 | Status status_; | |
88 | // No copying allowed | |
89 | PlainTableIterator(const PlainTableIterator&) = delete; | |
90 | void operator=(const Iterator&) = delete; | |
91 | }; | |
92 | ||
93 | extern const uint64_t kPlainTableMagicNumber; | |
94 | PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, | |
95 | unique_ptr<RandomAccessFileReader>&& file, | |
96 | const EnvOptions& storage_options, | |
97 | const InternalKeyComparator& icomparator, | |
98 | EncodingType encoding_type, | |
99 | uint64_t file_size, | |
100 | const TableProperties* table_properties) | |
101 | : internal_comparator_(icomparator), | |
102 | encoding_type_(encoding_type), | |
103 | full_scan_mode_(false), | |
104 | user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), | |
105 | prefix_extractor_(ioptions.prefix_extractor), | |
106 | enable_bloom_(false), | |
107 | bloom_(6, nullptr), | |
108 | file_info_(std::move(file), storage_options, | |
109 | static_cast<uint32_t>(table_properties->data_size)), | |
110 | ioptions_(ioptions), | |
111 | file_size_(file_size), | |
112 | table_properties_(nullptr) {} | |
113 | ||
114 | PlainTableReader::~PlainTableReader() { | |
115 | } | |
116 | ||
117 | Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, | |
118 | const EnvOptions& env_options, | |
119 | const InternalKeyComparator& internal_comparator, | |
120 | unique_ptr<RandomAccessFileReader>&& file, | |
121 | uint64_t file_size, | |
122 | unique_ptr<TableReader>* table_reader, | |
123 | const int bloom_bits_per_key, | |
124 | double hash_table_ratio, size_t index_sparseness, | |
125 | size_t huge_page_tlb_size, bool full_scan_mode) { | |
126 | if (file_size > PlainTableIndex::kMaxFileSize) { | |
127 | return Status::NotSupported("File is too large for PlainTableReader!"); | |
128 | } | |
129 | ||
130 | TableProperties* props = nullptr; | |
131 | auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, | |
132 | ioptions, &props); | |
133 | if (!s.ok()) { | |
134 | return s; | |
135 | } | |
136 | ||
137 | assert(hash_table_ratio >= 0.0); | |
138 | auto& user_props = props->user_collected_properties; | |
139 | auto prefix_extractor_in_file = props->prefix_extractor_name; | |
140 | ||
141 | if (!full_scan_mode && | |
142 | !prefix_extractor_in_file.empty() /* old version sst file*/ | |
143 | && prefix_extractor_in_file != "nullptr") { | |
144 | if (!ioptions.prefix_extractor) { | |
145 | return Status::InvalidArgument( | |
146 | "Prefix extractor is missing when opening a PlainTable built " | |
147 | "using a prefix extractor"); | |
148 | } else if (prefix_extractor_in_file.compare( | |
149 | ioptions.prefix_extractor->Name()) != 0) { | |
150 | return Status::InvalidArgument( | |
151 | "Prefix extractor given doesn't match the one used to build " | |
152 | "PlainTable"); | |
153 | } | |
154 | } | |
155 | ||
156 | EncodingType encoding_type = kPlain; | |
157 | auto encoding_type_prop = | |
158 | user_props.find(PlainTablePropertyNames::kEncodingType); | |
159 | if (encoding_type_prop != user_props.end()) { | |
160 | encoding_type = static_cast<EncodingType>( | |
161 | DecodeFixed32(encoding_type_prop->second.c_str())); | |
162 | } | |
163 | ||
164 | std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( | |
165 | ioptions, std::move(file), env_options, internal_comparator, | |
166 | encoding_type, file_size, props)); | |
167 | ||
168 | s = new_reader->MmapDataIfNeeded(); | |
169 | if (!s.ok()) { | |
170 | return s; | |
171 | } | |
172 | ||
173 | if (!full_scan_mode) { | |
174 | s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, | |
175 | index_sparseness, huge_page_tlb_size); | |
176 | if (!s.ok()) { | |
177 | return s; | |
178 | } | |
179 | } else { | |
180 | // Flag to indicate it is a full scan mode so that none of the indexes | |
181 | // can be used. | |
182 | new_reader->full_scan_mode_ = true; | |
183 | } | |
184 | ||
185 | *table_reader = std::move(new_reader); | |
186 | return s; | |
187 | } | |
188 | ||
189 | void PlainTableReader::SetupForCompaction() { | |
190 | } | |
191 | ||
192 | InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, | |
193 | Arena* arena, | |
194 | bool skip_filters) { | |
195 | if (options.total_order_seek && !IsTotalOrderMode()) { | |
196 | return NewErrorInternalIterator( | |
197 | Status::InvalidArgument("total_order_seek not supported"), arena); | |
198 | } | |
199 | if (arena == nullptr) { | |
200 | return new PlainTableIterator(this, prefix_extractor_ != nullptr); | |
201 | } else { | |
202 | auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); | |
203 | return new (mem) PlainTableIterator(this, prefix_extractor_ != nullptr); | |
204 | } | |
205 | } | |
206 | ||
207 | Status PlainTableReader::PopulateIndexRecordList( | |
208 | PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) { | |
209 | Slice prev_key_prefix_slice; | |
210 | std::string prev_key_prefix_buf; | |
211 | uint32_t pos = data_start_offset_; | |
212 | ||
213 | bool is_first_record = true; | |
214 | Slice key_prefix_slice; | |
215 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
216 | ioptions_.prefix_extractor); | |
217 | while (pos < file_info_.data_end_offset) { | |
218 | uint32_t key_offset = pos; | |
219 | ParsedInternalKey key; | |
220 | Slice value_slice; | |
221 | bool seekable = false; | |
222 | Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); | |
223 | if (!s.ok()) { | |
224 | return s; | |
225 | } | |
226 | ||
227 | key_prefix_slice = GetPrefix(key); | |
228 | if (enable_bloom_) { | |
229 | bloom_.AddHash(GetSliceHash(key.user_key)); | |
230 | } else { | |
231 | if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { | |
232 | if (!is_first_record) { | |
233 | prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); | |
234 | } | |
235 | if (file_info_.is_mmap_mode) { | |
236 | prev_key_prefix_slice = key_prefix_slice; | |
237 | } else { | |
238 | prev_key_prefix_buf = key_prefix_slice.ToString(); | |
239 | prev_key_prefix_slice = prev_key_prefix_buf; | |
240 | } | |
241 | } | |
242 | } | |
243 | ||
244 | index_builder->AddKeyPrefix(GetPrefix(key), key_offset); | |
245 | ||
246 | if (!seekable && is_first_record) { | |
247 | return Status::Corruption("Key for a prefix is not seekable"); | |
248 | } | |
249 | ||
250 | is_first_record = false; | |
251 | } | |
252 | ||
253 | prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); | |
254 | auto s = index_.InitFromRawData(index_builder->Finish()); | |
255 | return s; | |
256 | } | |
257 | ||
258 | void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, | |
259 | int num_prefixes, | |
260 | size_t huge_page_tlb_size, | |
261 | vector<uint32_t>* prefix_hashes) { | |
262 | if (!IsTotalOrderMode()) { | |
263 | uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; | |
264 | if (bloom_total_bits > 0) { | |
265 | enable_bloom_ = true; | |
266 | bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, | |
267 | huge_page_tlb_size, ioptions_.info_log); | |
268 | FillBloom(prefix_hashes); | |
269 | } | |
270 | } | |
271 | } | |
272 | ||
273 | void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) { | |
274 | assert(bloom_.IsInitialized()); | |
275 | for (auto prefix_hash : *prefix_hashes) { | |
276 | bloom_.AddHash(prefix_hash); | |
277 | } | |
278 | } | |
279 | ||
280 | Status PlainTableReader::MmapDataIfNeeded() { | |
281 | if (file_info_.is_mmap_mode) { | |
282 | // Get mmapped memory. | |
283 | return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr); | |
284 | } | |
285 | return Status::OK(); | |
286 | } | |
287 | ||
288 | Status PlainTableReader::PopulateIndex(TableProperties* props, | |
289 | int bloom_bits_per_key, | |
290 | double hash_table_ratio, | |
291 | size_t index_sparseness, | |
292 | size_t huge_page_tlb_size) { | |
293 | assert(props != nullptr); | |
294 | table_properties_.reset(props); | |
295 | ||
296 | BlockContents index_block_contents; | |
297 | Status s = ReadMetaBlock( | |
298 | file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_, | |
299 | PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_contents); | |
300 | ||
301 | bool index_in_file = s.ok(); | |
302 | ||
303 | BlockContents bloom_block_contents; | |
304 | bool bloom_in_file = false; | |
305 | // We only need to read the bloom block if index block is in file. | |
306 | if (index_in_file) { | |
307 | s = ReadMetaBlock(file_info_.file.get(), file_size_, kPlainTableMagicNumber, | |
308 | ioptions_, BloomBlockBuilder::kBloomBlock, | |
309 | &bloom_block_contents); | |
310 | bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; | |
311 | } | |
312 | ||
313 | Slice* bloom_block; | |
314 | if (bloom_in_file) { | |
315 | // If bloom_block_contents.allocation is not empty (which will be the case | |
316 | // for non-mmap mode), it holds the alloated memory for the bloom block. | |
317 | // It needs to be kept alive to keep `bloom_block` valid. | |
318 | bloom_block_alloc_ = std::move(bloom_block_contents.allocation); | |
319 | bloom_block = &bloom_block_contents.data; | |
320 | } else { | |
321 | bloom_block = nullptr; | |
322 | } | |
323 | ||
324 | Slice* index_block; | |
325 | if (index_in_file) { | |
326 | // If index_block_contents.allocation is not empty (which will be the case | |
327 | // for non-mmap mode), it holds the alloated memory for the index block. | |
328 | // It needs to be kept alive to keep `index_block` valid. | |
329 | index_block_alloc_ = std::move(index_block_contents.allocation); | |
330 | index_block = &index_block_contents.data; | |
331 | } else { | |
332 | index_block = nullptr; | |
333 | } | |
334 | ||
335 | if ((ioptions_.prefix_extractor == nullptr) && | |
336 | (hash_table_ratio != 0)) { | |
337 | // ioptions.prefix_extractor is requried for a hash-based look-up. | |
338 | return Status::NotSupported( | |
339 | "PlainTable requires a prefix extractor enable prefix hash mode."); | |
340 | } | |
341 | ||
342 | // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows | |
343 | // for a prefix (starting from the first one), generate a record of (hash, | |
344 | // offset) and append it to IndexRecordList, which is a data structure created | |
345 | // to store them. | |
346 | ||
347 | if (!index_in_file) { | |
348 | // Allocate bloom filter here for total order mode. | |
349 | if (IsTotalOrderMode()) { | |
350 | uint32_t num_bloom_bits = | |
351 | static_cast<uint32_t>(table_properties_->num_entries) * | |
352 | bloom_bits_per_key; | |
353 | if (num_bloom_bits > 0) { | |
354 | enable_bloom_ = true; | |
355 | bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, | |
356 | huge_page_tlb_size, ioptions_.info_log); | |
357 | } | |
358 | } | |
359 | } else if (bloom_in_file) { | |
360 | enable_bloom_ = true; | |
361 | auto num_blocks_property = props->user_collected_properties.find( | |
362 | PlainTablePropertyNames::kNumBloomBlocks); | |
363 | ||
364 | uint32_t num_blocks = 0; | |
365 | if (num_blocks_property != props->user_collected_properties.end()) { | |
366 | Slice temp_slice(num_blocks_property->second); | |
367 | if (!GetVarint32(&temp_slice, &num_blocks)) { | |
368 | num_blocks = 0; | |
369 | } | |
370 | } | |
371 | // cast away const qualifier, because bloom_ won't be changed | |
372 | bloom_.SetRawData( | |
373 | const_cast<unsigned char*>( | |
374 | reinterpret_cast<const unsigned char*>(bloom_block->data())), | |
375 | static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks); | |
376 | } else { | |
377 | // Index in file but no bloom in file. Disable bloom filter in this case. | |
378 | enable_bloom_ = false; | |
379 | bloom_bits_per_key = 0; | |
380 | } | |
381 | ||
382 | PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, | |
383 | hash_table_ratio, huge_page_tlb_size); | |
384 | ||
385 | std::vector<uint32_t> prefix_hashes; | |
386 | if (!index_in_file) { | |
387 | s = PopulateIndexRecordList(&index_builder, &prefix_hashes); | |
388 | if (!s.ok()) { | |
389 | return s; | |
390 | } | |
391 | } else { | |
392 | s = index_.InitFromRawData(*index_block); | |
393 | if (!s.ok()) { | |
394 | return s; | |
395 | } | |
396 | } | |
397 | ||
398 | if (!index_in_file) { | |
399 | // Calculated bloom filter size and allocate memory for | |
400 | // bloom filter based on the number of prefixes, then fill it. | |
401 | AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), | |
402 | huge_page_tlb_size, &prefix_hashes); | |
403 | } | |
404 | ||
405 | // Fill two table properties. | |
406 | if (!index_in_file) { | |
407 | props->user_collected_properties["plain_table_hash_table_size"] = | |
408 | ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); | |
409 | props->user_collected_properties["plain_table_sub_index_size"] = | |
410 | ToString(index_.GetSubIndexSize()); | |
411 | } else { | |
412 | props->user_collected_properties["plain_table_hash_table_size"] = | |
413 | ToString(0); | |
414 | props->user_collected_properties["plain_table_sub_index_size"] = | |
415 | ToString(0); | |
416 | } | |
417 | ||
418 | return Status::OK(); | |
419 | } | |
420 | ||
421 | Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, | |
422 | const Slice& target, const Slice& prefix, | |
423 | uint32_t prefix_hash, bool& prefix_matched, | |
424 | uint32_t* offset) const { | |
425 | prefix_matched = false; | |
426 | uint32_t prefix_index_offset; | |
427 | auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); | |
428 | if (res == PlainTableIndex::kNoPrefixForBucket) { | |
429 | *offset = file_info_.data_end_offset; | |
430 | return Status::OK(); | |
431 | } else if (res == PlainTableIndex::kDirectToFile) { | |
432 | *offset = prefix_index_offset; | |
433 | return Status::OK(); | |
434 | } | |
435 | ||
436 | // point to sub-index, need to do a binary search | |
437 | uint32_t upper_bound; | |
438 | const char* base_ptr = | |
439 | index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); | |
440 | uint32_t low = 0; | |
441 | uint32_t high = upper_bound; | |
442 | ParsedInternalKey mid_key; | |
443 | ParsedInternalKey parsed_target; | |
444 | if (!ParseInternalKey(target, &parsed_target)) { | |
445 | return Status::Corruption(Slice()); | |
446 | } | |
447 | ||
448 | // The key is between [low, high). Do a binary search between it. | |
449 | while (high - low > 1) { | |
450 | uint32_t mid = (high + low) / 2; | |
451 | uint32_t file_offset = GetFixed32Element(base_ptr, mid); | |
452 | uint32_t tmp; | |
453 | Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); | |
454 | if (!s.ok()) { | |
455 | return s; | |
456 | } | |
457 | int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); | |
458 | if (cmp_result < 0) { | |
459 | low = mid; | |
460 | } else { | |
461 | if (cmp_result == 0) { | |
462 | // Happen to have found the exact key or target is smaller than the | |
463 | // first key after base_offset. | |
464 | prefix_matched = true; | |
465 | *offset = file_offset; | |
466 | return Status::OK(); | |
467 | } else { | |
468 | high = mid; | |
469 | } | |
470 | } | |
471 | } | |
472 | // Both of the key at the position low or low+1 could share the same | |
473 | // prefix as target. We need to rule out one of them to avoid to go | |
474 | // to the wrong prefix. | |
475 | ParsedInternalKey low_key; | |
476 | uint32_t tmp; | |
477 | uint32_t low_key_offset = GetFixed32Element(base_ptr, low); | |
478 | Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); | |
479 | if (!s.ok()) { | |
480 | return s; | |
481 | } | |
482 | ||
483 | if (GetPrefix(low_key) == prefix) { | |
484 | prefix_matched = true; | |
485 | *offset = low_key_offset; | |
486 | } else if (low + 1 < upper_bound) { | |
487 | // There is possible a next prefix, return it | |
488 | prefix_matched = false; | |
489 | *offset = GetFixed32Element(base_ptr, low + 1); | |
490 | } else { | |
491 | // target is larger than a key of the last prefix in this bucket | |
492 | // but with a different prefix. Key does not exist. | |
493 | *offset = file_info_.data_end_offset; | |
494 | } | |
495 | return Status::OK(); | |
496 | } | |
497 | ||
498 | bool PlainTableReader::MatchBloom(uint32_t hash) const { | |
499 | if (!enable_bloom_) { | |
500 | return true; | |
501 | } | |
502 | ||
503 | if (bloom_.MayContainHash(hash)) { | |
504 | PERF_COUNTER_ADD(bloom_sst_hit_count, 1); | |
505 | return true; | |
506 | } else { | |
507 | PERF_COUNTER_ADD(bloom_sst_miss_count, 1); | |
508 | return false; | |
509 | } | |
510 | } | |
511 | ||
512 | Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, | |
513 | ParsedInternalKey* parsed_key, | |
514 | Slice* internal_key, Slice* value, | |
515 | bool* seekable) const { | |
516 | if (*offset == file_info_.data_end_offset) { | |
517 | *offset = file_info_.data_end_offset; | |
518 | return Status::OK(); | |
519 | } | |
520 | ||
521 | if (*offset > file_info_.data_end_offset) { | |
522 | return Status::Corruption("Offset is out of file size"); | |
523 | } | |
524 | ||
525 | uint32_t bytes_read; | |
526 | Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, | |
527 | &bytes_read, seekable); | |
528 | if (!s.ok()) { | |
529 | return s; | |
530 | } | |
531 | *offset = *offset + bytes_read; | |
532 | return Status::OK(); | |
533 | } | |
534 | ||
535 | void PlainTableReader::Prepare(const Slice& target) { | |
536 | if (enable_bloom_) { | |
537 | uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); | |
538 | bloom_.Prefetch(prefix_hash); | |
539 | } | |
540 | } | |
541 | ||
542 | Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, | |
543 | GetContext* get_context, bool skip_filters) { | |
544 | // Check bloom filter first. | |
545 | Slice prefix_slice; | |
546 | uint32_t prefix_hash; | |
547 | if (IsTotalOrderMode()) { | |
548 | if (full_scan_mode_) { | |
549 | status_ = | |
550 | Status::InvalidArgument("Get() is not allowed in full scan mode."); | |
551 | } | |
552 | // Match whole user key for bloom filter check. | |
553 | if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { | |
554 | return Status::OK(); | |
555 | } | |
556 | // in total order mode, there is only one bucket 0, and we always use empty | |
557 | // prefix. | |
558 | prefix_slice = Slice(); | |
559 | prefix_hash = 0; | |
560 | } else { | |
561 | prefix_slice = GetPrefix(target); | |
562 | prefix_hash = GetSliceHash(prefix_slice); | |
563 | if (!MatchBloom(prefix_hash)) { | |
564 | return Status::OK(); | |
565 | } | |
566 | } | |
567 | uint32_t offset; | |
568 | bool prefix_match; | |
569 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
570 | ioptions_.prefix_extractor); | |
571 | Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, | |
572 | prefix_match, &offset); | |
573 | ||
574 | if (!s.ok()) { | |
575 | return s; | |
576 | } | |
577 | ParsedInternalKey found_key; | |
578 | ParsedInternalKey parsed_target; | |
579 | if (!ParseInternalKey(target, &parsed_target)) { | |
580 | return Status::Corruption(Slice()); | |
581 | } | |
582 | Slice found_value; | |
583 | while (offset < file_info_.data_end_offset) { | |
584 | s = Next(&decoder, &offset, &found_key, nullptr, &found_value); | |
585 | if (!s.ok()) { | |
586 | return s; | |
587 | } | |
588 | if (!prefix_match) { | |
589 | // Need to verify prefix for the first key found if it is not yet | |
590 | // checked. | |
591 | if (GetPrefix(found_key) != prefix_slice) { | |
592 | return Status::OK(); | |
593 | } | |
594 | prefix_match = true; | |
595 | } | |
596 | // TODO(ljin): since we know the key comparison result here, | |
597 | // can we enable the fast path? | |
598 | if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { | |
599 | if (!get_context->SaveValue(found_key, found_value)) { | |
600 | break; | |
601 | } | |
602 | } | |
603 | } | |
604 | return Status::OK(); | |
605 | } | |
606 | ||
607 | uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { | |
608 | return 0; | |
609 | } | |
610 | ||
611 | PlainTableIterator::PlainTableIterator(PlainTableReader* table, | |
612 | bool use_prefix_seek) | |
613 | : table_(table), | |
614 | decoder_(&table_->file_info_, table_->encoding_type_, | |
615 | table_->user_key_len_, table_->prefix_extractor_), | |
616 | use_prefix_seek_(use_prefix_seek) { | |
617 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
618 | } | |
619 | ||
620 | PlainTableIterator::~PlainTableIterator() { | |
621 | } | |
622 | ||
623 | bool PlainTableIterator::Valid() const { | |
624 | return offset_ < table_->file_info_.data_end_offset && | |
625 | offset_ >= table_->data_start_offset_; | |
626 | } | |
627 | ||
628 | void PlainTableIterator::SeekToFirst() { | |
629 | next_offset_ = table_->data_start_offset_; | |
630 | if (next_offset_ >= table_->file_info_.data_end_offset) { | |
631 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
632 | } else { | |
633 | Next(); | |
634 | } | |
635 | } | |
636 | ||
637 | void PlainTableIterator::SeekToLast() { | |
638 | assert(false); | |
639 | status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); | |
640 | } | |
641 | ||
642 | void PlainTableIterator::Seek(const Slice& target) { | |
643 | // If the user doesn't set prefix seek option and we are not able to do a | |
644 | // total Seek(). assert failure. | |
645 | if (!use_prefix_seek_) { | |
646 | if (table_->full_scan_mode_) { | |
647 | status_ = | |
648 | Status::InvalidArgument("Seek() is not allowed in full scan mode."); | |
649 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
650 | return; | |
651 | } else if (table_->GetIndexSize() > 1) { | |
652 | assert(false); | |
653 | status_ = Status::NotSupported( | |
654 | "PlainTable cannot issue non-prefix seek unless in total order " | |
655 | "mode."); | |
656 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
657 | return; | |
658 | } | |
659 | } | |
660 | ||
661 | Slice prefix_slice = table_->GetPrefix(target); | |
662 | uint32_t prefix_hash = 0; | |
663 | // Bloom filter is ignored in total-order mode. | |
664 | if (!table_->IsTotalOrderMode()) { | |
665 | prefix_hash = GetSliceHash(prefix_slice); | |
666 | if (!table_->MatchBloom(prefix_hash)) { | |
667 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
668 | return; | |
669 | } | |
670 | } | |
671 | bool prefix_match; | |
672 | status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, | |
673 | prefix_match, &next_offset_); | |
674 | if (!status_.ok()) { | |
675 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
676 | return; | |
677 | } | |
678 | ||
679 | if (next_offset_ < table_->file_info_.data_end_offset) { | |
680 | for (Next(); status_.ok() && Valid(); Next()) { | |
681 | if (!prefix_match) { | |
682 | // Need to verify the first key's prefix | |
683 | if (table_->GetPrefix(key()) != prefix_slice) { | |
684 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
685 | break; | |
686 | } | |
687 | prefix_match = true; | |
688 | } | |
689 | if (table_->internal_comparator_.Compare(key(), target) >= 0) { | |
690 | break; | |
691 | } | |
692 | } | |
693 | } else { | |
694 | offset_ = table_->file_info_.data_end_offset; | |
695 | } | |
696 | } | |
697 | ||
698 | void PlainTableIterator::SeekForPrev(const Slice& target) { | |
699 | assert(false); | |
700 | status_ = | |
701 | Status::NotSupported("SeekForPrev() is not supported in PlainTable"); | |
702 | } | |
703 | ||
704 | void PlainTableIterator::Next() { | |
705 | offset_ = next_offset_; | |
706 | if (offset_ < table_->file_info_.data_end_offset) { | |
707 | Slice tmp_slice; | |
708 | ParsedInternalKey parsed_key; | |
709 | status_ = | |
710 | table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); | |
711 | if (!status_.ok()) { | |
712 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
713 | } | |
714 | } | |
715 | } | |
716 | ||
717 | void PlainTableIterator::Prev() { | |
718 | assert(false); | |
719 | } | |
720 | ||
721 | Slice PlainTableIterator::key() const { | |
722 | assert(Valid()); | |
723 | return key_; | |
724 | } | |
725 | ||
726 | Slice PlainTableIterator::value() const { | |
727 | assert(Valid()); | |
728 | return value_; | |
729 | } | |
730 | ||
731 | Status PlainTableIterator::status() const { | |
732 | return status_; | |
733 | } | |
734 | ||
735 | } // namespace rocksdb | |
736 | #endif // ROCKSDB_LITE |