]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be | |
3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
4 | ||
5 | #ifndef ROCKSDB_LITE | |
6 | ||
7 | #include "table/plain_table_reader.h" | |
8 | ||
9 | #include <string> | |
10 | #include <vector> | |
11 | ||
12 | #include "db/dbformat.h" | |
13 | ||
14 | #include "rocksdb/cache.h" | |
15 | #include "rocksdb/comparator.h" | |
16 | #include "rocksdb/env.h" | |
17 | #include "rocksdb/filter_policy.h" | |
18 | #include "rocksdb/options.h" | |
19 | #include "rocksdb/statistics.h" | |
20 | ||
21 | #include "table/block.h" | |
22 | #include "table/bloom_block.h" | |
23 | #include "table/filter_block.h" | |
24 | #include "table/format.h" | |
25 | #include "table/internal_iterator.h" | |
26 | #include "table/meta_blocks.h" | |
27 | #include "table/two_level_iterator.h" | |
28 | #include "table/plain_table_factory.h" | |
29 | #include "table/plain_table_key_coding.h" | |
30 | #include "table/get_context.h" | |
31 | ||
32 | #include "monitoring/histogram.h" | |
33 | #include "monitoring/perf_context_imp.h" | |
34 | #include "util/arena.h" | |
35 | #include "util/coding.h" | |
36 | #include "util/dynamic_bloom.h" | |
37 | #include "util/hash.h" | |
38 | #include "util/murmurhash.h" | |
39 | #include "util/stop_watch.h" | |
40 | #include "util/string_util.h" | |
41 | ||
42 | namespace rocksdb { | |
43 | ||
44 | namespace { | |
45 | ||
46 | // Safely getting a uint32_t element from a char array, where, starting from | |
47 | // `base`, every 4 bytes are considered as an fixed 32 bit integer. | |
48 | inline uint32_t GetFixed32Element(const char* base, size_t offset) { | |
49 | return DecodeFixed32(base + offset * sizeof(uint32_t)); | |
50 | } | |
51 | } // namespace | |
52 | ||
53 | // Iterator to iterate IndexedTable | |
54 | class PlainTableIterator : public InternalIterator { | |
55 | public: | |
56 | explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); | |
57 | ~PlainTableIterator(); | |
58 | ||
59 | bool Valid() const override; | |
60 | ||
61 | void SeekToFirst() override; | |
62 | ||
63 | void SeekToLast() override; | |
64 | ||
65 | void Seek(const Slice& target) override; | |
66 | ||
67 | void SeekForPrev(const Slice& target) override; | |
68 | ||
69 | void Next() override; | |
70 | ||
71 | void Prev() override; | |
72 | ||
73 | Slice key() const override; | |
74 | ||
75 | Slice value() const override; | |
76 | ||
77 | Status status() const override; | |
78 | ||
79 | private: | |
80 | PlainTableReader* table_; | |
81 | PlainTableKeyDecoder decoder_; | |
82 | bool use_prefix_seek_; | |
83 | uint32_t offset_; | |
84 | uint32_t next_offset_; | |
85 | Slice key_; | |
86 | Slice value_; | |
87 | Status status_; | |
88 | // No copying allowed | |
89 | PlainTableIterator(const PlainTableIterator&) = delete; | |
90 | void operator=(const Iterator&) = delete; | |
91 | }; | |
92 | ||
93 | extern const uint64_t kPlainTableMagicNumber; | |
94 | PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, | |
95 | unique_ptr<RandomAccessFileReader>&& file, | |
96 | const EnvOptions& storage_options, | |
97 | const InternalKeyComparator& icomparator, | |
98 | EncodingType encoding_type, | |
99 | uint64_t file_size, | |
11fdf7f2 TL |
100 | const TableProperties* table_properties, |
101 | const SliceTransform* prefix_extractor) | |
7c673cae FG |
102 | : internal_comparator_(icomparator), |
103 | encoding_type_(encoding_type), | |
104 | full_scan_mode_(false), | |
105 | user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), | |
11fdf7f2 | 106 | prefix_extractor_(prefix_extractor), |
7c673cae FG |
107 | enable_bloom_(false), |
108 | bloom_(6, nullptr), | |
109 | file_info_(std::move(file), storage_options, | |
110 | static_cast<uint32_t>(table_properties->data_size)), | |
111 | ioptions_(ioptions), | |
112 | file_size_(file_size), | |
113 | table_properties_(nullptr) {} | |
114 | ||
115 | PlainTableReader::~PlainTableReader() { | |
116 | } | |
117 | ||
11fdf7f2 TL |
118 | Status PlainTableReader::Open( |
119 | const ImmutableCFOptions& ioptions, const EnvOptions& env_options, | |
120 | const InternalKeyComparator& internal_comparator, | |
121 | unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, | |
122 | unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, | |
123 | double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, | |
124 | bool full_scan_mode, const SliceTransform* prefix_extractor) { | |
7c673cae FG |
125 | if (file_size > PlainTableIndex::kMaxFileSize) { |
126 | return Status::NotSupported("File is too large for PlainTableReader!"); | |
127 | } | |
128 | ||
129 | TableProperties* props = nullptr; | |
130 | auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, | |
11fdf7f2 TL |
131 | ioptions, &props, |
132 | true /* compression_type_missing */); | |
7c673cae FG |
133 | if (!s.ok()) { |
134 | return s; | |
135 | } | |
136 | ||
137 | assert(hash_table_ratio >= 0.0); | |
138 | auto& user_props = props->user_collected_properties; | |
139 | auto prefix_extractor_in_file = props->prefix_extractor_name; | |
140 | ||
141 | if (!full_scan_mode && | |
142 | !prefix_extractor_in_file.empty() /* old version sst file*/ | |
143 | && prefix_extractor_in_file != "nullptr") { | |
11fdf7f2 | 144 | if (!prefix_extractor) { |
7c673cae FG |
145 | return Status::InvalidArgument( |
146 | "Prefix extractor is missing when opening a PlainTable built " | |
147 | "using a prefix extractor"); | |
11fdf7f2 TL |
148 | } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != |
149 | 0) { | |
7c673cae FG |
150 | return Status::InvalidArgument( |
151 | "Prefix extractor given doesn't match the one used to build " | |
152 | "PlainTable"); | |
153 | } | |
154 | } | |
155 | ||
156 | EncodingType encoding_type = kPlain; | |
157 | auto encoding_type_prop = | |
158 | user_props.find(PlainTablePropertyNames::kEncodingType); | |
159 | if (encoding_type_prop != user_props.end()) { | |
160 | encoding_type = static_cast<EncodingType>( | |
161 | DecodeFixed32(encoding_type_prop->second.c_str())); | |
162 | } | |
163 | ||
164 | std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( | |
165 | ioptions, std::move(file), env_options, internal_comparator, | |
11fdf7f2 | 166 | encoding_type, file_size, props, prefix_extractor)); |
7c673cae FG |
167 | |
168 | s = new_reader->MmapDataIfNeeded(); | |
169 | if (!s.ok()) { | |
170 | return s; | |
171 | } | |
172 | ||
173 | if (!full_scan_mode) { | |
174 | s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, | |
175 | index_sparseness, huge_page_tlb_size); | |
176 | if (!s.ok()) { | |
177 | return s; | |
178 | } | |
179 | } else { | |
180 | // Flag to indicate it is a full scan mode so that none of the indexes | |
181 | // can be used. | |
182 | new_reader->full_scan_mode_ = true; | |
183 | } | |
184 | ||
185 | *table_reader = std::move(new_reader); | |
186 | return s; | |
187 | } | |
188 | ||
189 | void PlainTableReader::SetupForCompaction() { | |
190 | } | |
191 | ||
11fdf7f2 TL |
192 | InternalIterator* PlainTableReader::NewIterator( |
193 | const ReadOptions& options, const SliceTransform* /* prefix_extractor */, | |
194 | Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { | |
195 | bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; | |
7c673cae | 196 | if (arena == nullptr) { |
11fdf7f2 | 197 | return new PlainTableIterator(this, use_prefix_seek); |
7c673cae FG |
198 | } else { |
199 | auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); | |
11fdf7f2 | 200 | return new (mem) PlainTableIterator(this, use_prefix_seek); |
7c673cae FG |
201 | } |
202 | } | |
203 | ||
204 | Status PlainTableReader::PopulateIndexRecordList( | |
205 | PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) { | |
206 | Slice prev_key_prefix_slice; | |
207 | std::string prev_key_prefix_buf; | |
208 | uint32_t pos = data_start_offset_; | |
209 | ||
210 | bool is_first_record = true; | |
211 | Slice key_prefix_slice; | |
212 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
11fdf7f2 | 213 | prefix_extractor_); |
7c673cae FG |
214 | while (pos < file_info_.data_end_offset) { |
215 | uint32_t key_offset = pos; | |
216 | ParsedInternalKey key; | |
217 | Slice value_slice; | |
218 | bool seekable = false; | |
219 | Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); | |
220 | if (!s.ok()) { | |
221 | return s; | |
222 | } | |
223 | ||
224 | key_prefix_slice = GetPrefix(key); | |
225 | if (enable_bloom_) { | |
226 | bloom_.AddHash(GetSliceHash(key.user_key)); | |
227 | } else { | |
228 | if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { | |
229 | if (!is_first_record) { | |
230 | prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); | |
231 | } | |
232 | if (file_info_.is_mmap_mode) { | |
233 | prev_key_prefix_slice = key_prefix_slice; | |
234 | } else { | |
235 | prev_key_prefix_buf = key_prefix_slice.ToString(); | |
236 | prev_key_prefix_slice = prev_key_prefix_buf; | |
237 | } | |
238 | } | |
239 | } | |
240 | ||
241 | index_builder->AddKeyPrefix(GetPrefix(key), key_offset); | |
242 | ||
243 | if (!seekable && is_first_record) { | |
244 | return Status::Corruption("Key for a prefix is not seekable"); | |
245 | } | |
246 | ||
247 | is_first_record = false; | |
248 | } | |
249 | ||
250 | prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); | |
251 | auto s = index_.InitFromRawData(index_builder->Finish()); | |
252 | return s; | |
253 | } | |
254 | ||
255 | void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, | |
256 | int num_prefixes, | |
257 | size_t huge_page_tlb_size, | |
258 | vector<uint32_t>* prefix_hashes) { | |
259 | if (!IsTotalOrderMode()) { | |
260 | uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; | |
261 | if (bloom_total_bits > 0) { | |
262 | enable_bloom_ = true; | |
263 | bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, | |
264 | huge_page_tlb_size, ioptions_.info_log); | |
265 | FillBloom(prefix_hashes); | |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) { | |
271 | assert(bloom_.IsInitialized()); | |
272 | for (auto prefix_hash : *prefix_hashes) { | |
273 | bloom_.AddHash(prefix_hash); | |
274 | } | |
275 | } | |
276 | ||
277 | Status PlainTableReader::MmapDataIfNeeded() { | |
278 | if (file_info_.is_mmap_mode) { | |
279 | // Get mmapped memory. | |
11fdf7f2 | 280 | return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr); |
7c673cae FG |
281 | } |
282 | return Status::OK(); | |
283 | } | |
284 | ||
285 | Status PlainTableReader::PopulateIndex(TableProperties* props, | |
286 | int bloom_bits_per_key, | |
287 | double hash_table_ratio, | |
288 | size_t index_sparseness, | |
289 | size_t huge_page_tlb_size) { | |
290 | assert(props != nullptr); | |
291 | table_properties_.reset(props); | |
292 | ||
293 | BlockContents index_block_contents; | |
11fdf7f2 TL |
294 | Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, |
295 | file_size_, kPlainTableMagicNumber, ioptions_, | |
296 | PlainTableIndexBuilder::kPlainTableIndexBlock, | |
297 | &index_block_contents, | |
298 | true /* compression_type_missing */); | |
7c673cae FG |
299 | |
300 | bool index_in_file = s.ok(); | |
301 | ||
302 | BlockContents bloom_block_contents; | |
303 | bool bloom_in_file = false; | |
304 | // We only need to read the bloom block if index block is in file. | |
305 | if (index_in_file) { | |
11fdf7f2 TL |
306 | s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, |
307 | file_size_, kPlainTableMagicNumber, ioptions_, | |
308 | BloomBlockBuilder::kBloomBlock, &bloom_block_contents, | |
309 | true /* compression_type_missing */); | |
7c673cae FG |
310 | bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; |
311 | } | |
312 | ||
313 | Slice* bloom_block; | |
314 | if (bloom_in_file) { | |
315 | // If bloom_block_contents.allocation is not empty (which will be the case | |
316 | // for non-mmap mode), it holds the alloated memory for the bloom block. | |
317 | // It needs to be kept alive to keep `bloom_block` valid. | |
318 | bloom_block_alloc_ = std::move(bloom_block_contents.allocation); | |
319 | bloom_block = &bloom_block_contents.data; | |
320 | } else { | |
321 | bloom_block = nullptr; | |
322 | } | |
323 | ||
324 | Slice* index_block; | |
325 | if (index_in_file) { | |
326 | // If index_block_contents.allocation is not empty (which will be the case | |
327 | // for non-mmap mode), it holds the alloated memory for the index block. | |
328 | // It needs to be kept alive to keep `index_block` valid. | |
329 | index_block_alloc_ = std::move(index_block_contents.allocation); | |
330 | index_block = &index_block_contents.data; | |
331 | } else { | |
332 | index_block = nullptr; | |
333 | } | |
334 | ||
11fdf7f2 TL |
335 | if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { |
336 | // moptions.prefix_extractor is requried for a hash-based look-up. | |
7c673cae FG |
337 | return Status::NotSupported( |
338 | "PlainTable requires a prefix extractor enable prefix hash mode."); | |
339 | } | |
340 | ||
341 | // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows | |
342 | // for a prefix (starting from the first one), generate a record of (hash, | |
343 | // offset) and append it to IndexRecordList, which is a data structure created | |
344 | // to store them. | |
345 | ||
346 | if (!index_in_file) { | |
347 | // Allocate bloom filter here for total order mode. | |
348 | if (IsTotalOrderMode()) { | |
349 | uint32_t num_bloom_bits = | |
350 | static_cast<uint32_t>(table_properties_->num_entries) * | |
351 | bloom_bits_per_key; | |
352 | if (num_bloom_bits > 0) { | |
353 | enable_bloom_ = true; | |
354 | bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, | |
355 | huge_page_tlb_size, ioptions_.info_log); | |
356 | } | |
357 | } | |
358 | } else if (bloom_in_file) { | |
359 | enable_bloom_ = true; | |
360 | auto num_blocks_property = props->user_collected_properties.find( | |
361 | PlainTablePropertyNames::kNumBloomBlocks); | |
362 | ||
363 | uint32_t num_blocks = 0; | |
364 | if (num_blocks_property != props->user_collected_properties.end()) { | |
365 | Slice temp_slice(num_blocks_property->second); | |
366 | if (!GetVarint32(&temp_slice, &num_blocks)) { | |
367 | num_blocks = 0; | |
368 | } | |
369 | } | |
370 | // cast away const qualifier, because bloom_ won't be changed | |
371 | bloom_.SetRawData( | |
372 | const_cast<unsigned char*>( | |
373 | reinterpret_cast<const unsigned char*>(bloom_block->data())), | |
374 | static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks); | |
375 | } else { | |
376 | // Index in file but no bloom in file. Disable bloom filter in this case. | |
377 | enable_bloom_ = false; | |
378 | bloom_bits_per_key = 0; | |
379 | } | |
380 | ||
11fdf7f2 TL |
381 | PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, |
382 | index_sparseness, hash_table_ratio, | |
383 | huge_page_tlb_size); | |
7c673cae FG |
384 | |
385 | std::vector<uint32_t> prefix_hashes; | |
386 | if (!index_in_file) { | |
387 | s = PopulateIndexRecordList(&index_builder, &prefix_hashes); | |
388 | if (!s.ok()) { | |
389 | return s; | |
390 | } | |
391 | } else { | |
392 | s = index_.InitFromRawData(*index_block); | |
393 | if (!s.ok()) { | |
394 | return s; | |
395 | } | |
396 | } | |
397 | ||
398 | if (!index_in_file) { | |
399 | // Calculated bloom filter size and allocate memory for | |
400 | // bloom filter based on the number of prefixes, then fill it. | |
401 | AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), | |
402 | huge_page_tlb_size, &prefix_hashes); | |
403 | } | |
404 | ||
405 | // Fill two table properties. | |
406 | if (!index_in_file) { | |
407 | props->user_collected_properties["plain_table_hash_table_size"] = | |
408 | ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); | |
409 | props->user_collected_properties["plain_table_sub_index_size"] = | |
410 | ToString(index_.GetSubIndexSize()); | |
411 | } else { | |
412 | props->user_collected_properties["plain_table_hash_table_size"] = | |
413 | ToString(0); | |
414 | props->user_collected_properties["plain_table_sub_index_size"] = | |
415 | ToString(0); | |
416 | } | |
417 | ||
418 | return Status::OK(); | |
419 | } | |
420 | ||
421 | Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, | |
422 | const Slice& target, const Slice& prefix, | |
423 | uint32_t prefix_hash, bool& prefix_matched, | |
424 | uint32_t* offset) const { | |
425 | prefix_matched = false; | |
426 | uint32_t prefix_index_offset; | |
427 | auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); | |
428 | if (res == PlainTableIndex::kNoPrefixForBucket) { | |
429 | *offset = file_info_.data_end_offset; | |
430 | return Status::OK(); | |
431 | } else if (res == PlainTableIndex::kDirectToFile) { | |
432 | *offset = prefix_index_offset; | |
433 | return Status::OK(); | |
434 | } | |
435 | ||
436 | // point to sub-index, need to do a binary search | |
437 | uint32_t upper_bound; | |
438 | const char* base_ptr = | |
439 | index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); | |
440 | uint32_t low = 0; | |
441 | uint32_t high = upper_bound; | |
442 | ParsedInternalKey mid_key; | |
443 | ParsedInternalKey parsed_target; | |
444 | if (!ParseInternalKey(target, &parsed_target)) { | |
445 | return Status::Corruption(Slice()); | |
446 | } | |
447 | ||
448 | // The key is between [low, high). Do a binary search between it. | |
449 | while (high - low > 1) { | |
450 | uint32_t mid = (high + low) / 2; | |
451 | uint32_t file_offset = GetFixed32Element(base_ptr, mid); | |
452 | uint32_t tmp; | |
453 | Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); | |
454 | if (!s.ok()) { | |
455 | return s; | |
456 | } | |
457 | int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); | |
458 | if (cmp_result < 0) { | |
459 | low = mid; | |
460 | } else { | |
461 | if (cmp_result == 0) { | |
462 | // Happen to have found the exact key or target is smaller than the | |
463 | // first key after base_offset. | |
464 | prefix_matched = true; | |
465 | *offset = file_offset; | |
466 | return Status::OK(); | |
467 | } else { | |
468 | high = mid; | |
469 | } | |
470 | } | |
471 | } | |
472 | // Both of the key at the position low or low+1 could share the same | |
473 | // prefix as target. We need to rule out one of them to avoid to go | |
474 | // to the wrong prefix. | |
475 | ParsedInternalKey low_key; | |
476 | uint32_t tmp; | |
477 | uint32_t low_key_offset = GetFixed32Element(base_ptr, low); | |
478 | Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); | |
479 | if (!s.ok()) { | |
480 | return s; | |
481 | } | |
482 | ||
483 | if (GetPrefix(low_key) == prefix) { | |
484 | prefix_matched = true; | |
485 | *offset = low_key_offset; | |
486 | } else if (low + 1 < upper_bound) { | |
487 | // There is possible a next prefix, return it | |
488 | prefix_matched = false; | |
489 | *offset = GetFixed32Element(base_ptr, low + 1); | |
490 | } else { | |
491 | // target is larger than a key of the last prefix in this bucket | |
492 | // but with a different prefix. Key does not exist. | |
493 | *offset = file_info_.data_end_offset; | |
494 | } | |
495 | return Status::OK(); | |
496 | } | |
497 | ||
498 | bool PlainTableReader::MatchBloom(uint32_t hash) const { | |
499 | if (!enable_bloom_) { | |
500 | return true; | |
501 | } | |
502 | ||
503 | if (bloom_.MayContainHash(hash)) { | |
504 | PERF_COUNTER_ADD(bloom_sst_hit_count, 1); | |
505 | return true; | |
506 | } else { | |
507 | PERF_COUNTER_ADD(bloom_sst_miss_count, 1); | |
508 | return false; | |
509 | } | |
510 | } | |
511 | ||
512 | Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, | |
513 | ParsedInternalKey* parsed_key, | |
514 | Slice* internal_key, Slice* value, | |
515 | bool* seekable) const { | |
516 | if (*offset == file_info_.data_end_offset) { | |
517 | *offset = file_info_.data_end_offset; | |
518 | return Status::OK(); | |
519 | } | |
520 | ||
521 | if (*offset > file_info_.data_end_offset) { | |
522 | return Status::Corruption("Offset is out of file size"); | |
523 | } | |
524 | ||
525 | uint32_t bytes_read; | |
526 | Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, | |
527 | &bytes_read, seekable); | |
528 | if (!s.ok()) { | |
529 | return s; | |
530 | } | |
531 | *offset = *offset + bytes_read; | |
532 | return Status::OK(); | |
533 | } | |
534 | ||
535 | void PlainTableReader::Prepare(const Slice& target) { | |
536 | if (enable_bloom_) { | |
537 | uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); | |
538 | bloom_.Prefetch(prefix_hash); | |
539 | } | |
540 | } | |
541 | ||
11fdf7f2 TL |
542 | Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, |
543 | GetContext* get_context, | |
544 | const SliceTransform* /* prefix_extractor */, | |
545 | bool /*skip_filters*/) { | |
7c673cae FG |
546 | // Check bloom filter first. |
547 | Slice prefix_slice; | |
548 | uint32_t prefix_hash; | |
549 | if (IsTotalOrderMode()) { | |
550 | if (full_scan_mode_) { | |
551 | status_ = | |
552 | Status::InvalidArgument("Get() is not allowed in full scan mode."); | |
553 | } | |
554 | // Match whole user key for bloom filter check. | |
555 | if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { | |
556 | return Status::OK(); | |
557 | } | |
558 | // in total order mode, there is only one bucket 0, and we always use empty | |
559 | // prefix. | |
560 | prefix_slice = Slice(); | |
561 | prefix_hash = 0; | |
562 | } else { | |
563 | prefix_slice = GetPrefix(target); | |
564 | prefix_hash = GetSliceHash(prefix_slice); | |
565 | if (!MatchBloom(prefix_hash)) { | |
566 | return Status::OK(); | |
567 | } | |
568 | } | |
569 | uint32_t offset; | |
570 | bool prefix_match; | |
571 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
11fdf7f2 | 572 | prefix_extractor_); |
7c673cae FG |
573 | Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, |
574 | prefix_match, &offset); | |
575 | ||
576 | if (!s.ok()) { | |
577 | return s; | |
578 | } | |
579 | ParsedInternalKey found_key; | |
580 | ParsedInternalKey parsed_target; | |
581 | if (!ParseInternalKey(target, &parsed_target)) { | |
582 | return Status::Corruption(Slice()); | |
583 | } | |
584 | Slice found_value; | |
585 | while (offset < file_info_.data_end_offset) { | |
586 | s = Next(&decoder, &offset, &found_key, nullptr, &found_value); | |
587 | if (!s.ok()) { | |
588 | return s; | |
589 | } | |
590 | if (!prefix_match) { | |
591 | // Need to verify prefix for the first key found if it is not yet | |
592 | // checked. | |
593 | if (GetPrefix(found_key) != prefix_slice) { | |
594 | return Status::OK(); | |
595 | } | |
596 | prefix_match = true; | |
597 | } | |
598 | // TODO(ljin): since we know the key comparison result here, | |
599 | // can we enable the fast path? | |
600 | if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { | |
11fdf7f2 TL |
601 | bool dont_care __attribute__((__unused__)); |
602 | if (!get_context->SaveValue(found_key, found_value, &dont_care)) { | |
7c673cae FG |
603 | break; |
604 | } | |
605 | } | |
606 | } | |
607 | return Status::OK(); | |
608 | } | |
609 | ||
11fdf7f2 | 610 | uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { |
7c673cae FG |
611 | return 0; |
612 | } | |
613 | ||
614 | PlainTableIterator::PlainTableIterator(PlainTableReader* table, | |
615 | bool use_prefix_seek) | |
616 | : table_(table), | |
617 | decoder_(&table_->file_info_, table_->encoding_type_, | |
618 | table_->user_key_len_, table_->prefix_extractor_), | |
619 | use_prefix_seek_(use_prefix_seek) { | |
620 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
621 | } | |
622 | ||
623 | PlainTableIterator::~PlainTableIterator() { | |
624 | } | |
625 | ||
626 | bool PlainTableIterator::Valid() const { | |
627 | return offset_ < table_->file_info_.data_end_offset && | |
628 | offset_ >= table_->data_start_offset_; | |
629 | } | |
630 | ||
631 | void PlainTableIterator::SeekToFirst() { | |
11fdf7f2 | 632 | status_ = Status::OK(); |
7c673cae FG |
633 | next_offset_ = table_->data_start_offset_; |
634 | if (next_offset_ >= table_->file_info_.data_end_offset) { | |
635 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
636 | } else { | |
637 | Next(); | |
638 | } | |
639 | } | |
640 | ||
641 | void PlainTableIterator::SeekToLast() { | |
642 | assert(false); | |
643 | status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); | |
11fdf7f2 | 644 | next_offset_ = offset_ = table_->file_info_.data_end_offset; |
7c673cae FG |
645 | } |
646 | ||
647 | void PlainTableIterator::Seek(const Slice& target) { | |
11fdf7f2 TL |
648 | if (use_prefix_seek_ != !table_->IsTotalOrderMode()) { |
649 | // This check is done here instead of NewIterator() to permit creating an | |
650 | // iterator with total_order_seek = true even if we won't be able to Seek() | |
651 | // it. This is needed for compaction: it creates iterator with | |
652 | // total_order_seek = true but usually never does Seek() on it, | |
653 | // only SeekToFirst(). | |
654 | status_ = | |
655 | Status::InvalidArgument( | |
656 | "total_order_seek not implemented for PlainTable."); | |
657 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
658 | return; | |
659 | } | |
660 | ||
7c673cae FG |
661 | // If the user doesn't set prefix seek option and we are not able to do a |
662 | // total Seek(). assert failure. | |
11fdf7f2 | 663 | if (table_->IsTotalOrderMode()) { |
7c673cae FG |
664 | if (table_->full_scan_mode_) { |
665 | status_ = | |
666 | Status::InvalidArgument("Seek() is not allowed in full scan mode."); | |
667 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
668 | return; | |
669 | } else if (table_->GetIndexSize() > 1) { | |
670 | assert(false); | |
671 | status_ = Status::NotSupported( | |
672 | "PlainTable cannot issue non-prefix seek unless in total order " | |
673 | "mode."); | |
674 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
675 | return; | |
676 | } | |
677 | } | |
678 | ||
679 | Slice prefix_slice = table_->GetPrefix(target); | |
680 | uint32_t prefix_hash = 0; | |
681 | // Bloom filter is ignored in total-order mode. | |
682 | if (!table_->IsTotalOrderMode()) { | |
683 | prefix_hash = GetSliceHash(prefix_slice); | |
684 | if (!table_->MatchBloom(prefix_hash)) { | |
11fdf7f2 | 685 | status_ = Status::OK(); |
7c673cae FG |
686 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
687 | return; | |
688 | } | |
689 | } | |
690 | bool prefix_match; | |
691 | status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, | |
692 | prefix_match, &next_offset_); | |
693 | if (!status_.ok()) { | |
694 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
695 | return; | |
696 | } | |
697 | ||
698 | if (next_offset_ < table_->file_info_.data_end_offset) { | |
699 | for (Next(); status_.ok() && Valid(); Next()) { | |
700 | if (!prefix_match) { | |
701 | // Need to verify the first key's prefix | |
702 | if (table_->GetPrefix(key()) != prefix_slice) { | |
703 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
704 | break; | |
705 | } | |
706 | prefix_match = true; | |
707 | } | |
708 | if (table_->internal_comparator_.Compare(key(), target) >= 0) { | |
709 | break; | |
710 | } | |
711 | } | |
712 | } else { | |
713 | offset_ = table_->file_info_.data_end_offset; | |
714 | } | |
715 | } | |
716 | ||
11fdf7f2 | 717 | void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { |
7c673cae FG |
718 | assert(false); |
719 | status_ = | |
720 | Status::NotSupported("SeekForPrev() is not supported in PlainTable"); | |
11fdf7f2 | 721 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
7c673cae FG |
722 | } |
723 | ||
724 | void PlainTableIterator::Next() { | |
725 | offset_ = next_offset_; | |
726 | if (offset_ < table_->file_info_.data_end_offset) { | |
727 | Slice tmp_slice; | |
728 | ParsedInternalKey parsed_key; | |
729 | status_ = | |
730 | table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); | |
731 | if (!status_.ok()) { | |
732 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
733 | } | |
734 | } | |
735 | } | |
736 | ||
737 | void PlainTableIterator::Prev() { | |
738 | assert(false); | |
739 | } | |
740 | ||
741 | Slice PlainTableIterator::key() const { | |
742 | assert(Valid()); | |
743 | return key_; | |
744 | } | |
745 | ||
746 | Slice PlainTableIterator::value() const { | |
747 | assert(Valid()); | |
748 | return value_; | |
749 | } | |
750 | ||
751 | Status PlainTableIterator::status() const { | |
752 | return status_; | |
753 | } | |
754 | ||
755 | } // namespace rocksdb | |
756 | #endif // ROCKSDB_LITE |