]>
Commit | Line | Data |
---|---|---|
f67539c2 | 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
7c673cae FG |
2 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
3 | // Use of this source code is governed by a BSD-style license that can be | |
4 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
5 | ||
6 | #ifndef ROCKSDB_LITE | |
7 | ||
f67539c2 | 8 | #include "table/plain/plain_table_reader.h" |
7c673cae FG |
9 | |
10 | #include <string> | |
11 | #include <vector> | |
12 | ||
13 | #include "db/dbformat.h" | |
14 | ||
15 | #include "rocksdb/cache.h" | |
16 | #include "rocksdb/comparator.h" | |
17 | #include "rocksdb/env.h" | |
18 | #include "rocksdb/filter_policy.h" | |
19 | #include "rocksdb/options.h" | |
20 | #include "rocksdb/statistics.h" | |
21 | ||
f67539c2 TL |
22 | #include "table/block_based/block.h" |
23 | #include "table/block_based/filter_block.h" | |
7c673cae | 24 | #include "table/format.h" |
f67539c2 | 25 | #include "table/get_context.h" |
7c673cae FG |
26 | #include "table/internal_iterator.h" |
27 | #include "table/meta_blocks.h" | |
f67539c2 TL |
28 | #include "table/plain/plain_table_bloom.h" |
29 | #include "table/plain/plain_table_factory.h" | |
30 | #include "table/plain/plain_table_key_coding.h" | |
7c673cae | 31 | #include "table/two_level_iterator.h" |
7c673cae | 32 | |
f67539c2 | 33 | #include "memory/arena.h" |
7c673cae FG |
34 | #include "monitoring/histogram.h" |
35 | #include "monitoring/perf_context_imp.h" | |
7c673cae FG |
36 | #include "util/coding.h" |
37 | #include "util/dynamic_bloom.h" | |
38 | #include "util/hash.h" | |
7c673cae FG |
39 | #include "util/stop_watch.h" |
40 | #include "util/string_util.h" | |
41 | ||
f67539c2 | 42 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
43 | |
44 | namespace { | |
45 | ||
46 | // Safely getting a uint32_t element from a char array, where, starting from | |
47 | // `base`, every 4 bytes are considered as an fixed 32 bit integer. | |
48 | inline uint32_t GetFixed32Element(const char* base, size_t offset) { | |
49 | return DecodeFixed32(base + offset * sizeof(uint32_t)); | |
50 | } | |
51 | } // namespace | |
52 | ||
53 | // Iterator to iterate IndexedTable | |
54 | class PlainTableIterator : public InternalIterator { | |
55 | public: | |
56 | explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); | |
f67539c2 TL |
57 | // No copying allowed |
58 | PlainTableIterator(const PlainTableIterator&) = delete; | |
59 | void operator=(const Iterator&) = delete; | |
60 | ||
494da23a | 61 | ~PlainTableIterator() override; |
7c673cae FG |
62 | |
63 | bool Valid() const override; | |
64 | ||
65 | void SeekToFirst() override; | |
66 | ||
67 | void SeekToLast() override; | |
68 | ||
69 | void Seek(const Slice& target) override; | |
70 | ||
71 | void SeekForPrev(const Slice& target) override; | |
72 | ||
73 | void Next() override; | |
74 | ||
75 | void Prev() override; | |
76 | ||
77 | Slice key() const override; | |
78 | ||
79 | Slice value() const override; | |
80 | ||
81 | Status status() const override; | |
82 | ||
83 | private: | |
84 | PlainTableReader* table_; | |
85 | PlainTableKeyDecoder decoder_; | |
86 | bool use_prefix_seek_; | |
87 | uint32_t offset_; | |
88 | uint32_t next_offset_; | |
89 | Slice key_; | |
90 | Slice value_; | |
91 | Status status_; | |
7c673cae FG |
92 | }; |
93 | ||
94 | extern const uint64_t kPlainTableMagicNumber; | |
494da23a TL |
95 | PlainTableReader::PlainTableReader( |
96 | const ImmutableCFOptions& ioptions, | |
97 | std::unique_ptr<RandomAccessFileReader>&& file, | |
98 | const EnvOptions& storage_options, const InternalKeyComparator& icomparator, | |
99 | EncodingType encoding_type, uint64_t file_size, | |
100 | const TableProperties* table_properties, | |
101 | const SliceTransform* prefix_extractor) | |
7c673cae FG |
102 | : internal_comparator_(icomparator), |
103 | encoding_type_(encoding_type), | |
104 | full_scan_mode_(false), | |
105 | user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), | |
11fdf7f2 | 106 | prefix_extractor_(prefix_extractor), |
7c673cae | 107 | enable_bloom_(false), |
494da23a | 108 | bloom_(6), |
7c673cae FG |
109 | file_info_(std::move(file), storage_options, |
110 | static_cast<uint32_t>(table_properties->data_size)), | |
111 | ioptions_(ioptions), | |
112 | file_size_(file_size), | |
113 | table_properties_(nullptr) {} | |
114 | ||
115 | PlainTableReader::~PlainTableReader() { | |
20effc67 TL |
116 | // Should fix? |
117 | status_.PermitUncheckedError(); | |
7c673cae FG |
118 | } |
119 | ||
11fdf7f2 TL |
120 | Status PlainTableReader::Open( |
121 | const ImmutableCFOptions& ioptions, const EnvOptions& env_options, | |
122 | const InternalKeyComparator& internal_comparator, | |
494da23a TL |
123 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, |
124 | std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, | |
11fdf7f2 | 125 | double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, |
494da23a TL |
126 | bool full_scan_mode, const bool immortal_table, |
127 | const SliceTransform* prefix_extractor) { | |
7c673cae FG |
128 | if (file_size > PlainTableIndex::kMaxFileSize) { |
129 | return Status::NotSupported("File is too large for PlainTableReader!"); | |
130 | } | |
131 | ||
f67539c2 | 132 | TableProperties* props_ptr = nullptr; |
7c673cae | 133 | auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, |
f67539c2 | 134 | ioptions, &props_ptr, |
11fdf7f2 | 135 | true /* compression_type_missing */); |
f67539c2 | 136 | std::shared_ptr<TableProperties> props(props_ptr); |
7c673cae FG |
137 | if (!s.ok()) { |
138 | return s; | |
139 | } | |
140 | ||
141 | assert(hash_table_ratio >= 0.0); | |
142 | auto& user_props = props->user_collected_properties; | |
143 | auto prefix_extractor_in_file = props->prefix_extractor_name; | |
144 | ||
145 | if (!full_scan_mode && | |
146 | !prefix_extractor_in_file.empty() /* old version sst file*/ | |
147 | && prefix_extractor_in_file != "nullptr") { | |
11fdf7f2 | 148 | if (!prefix_extractor) { |
7c673cae FG |
149 | return Status::InvalidArgument( |
150 | "Prefix extractor is missing when opening a PlainTable built " | |
151 | "using a prefix extractor"); | |
11fdf7f2 TL |
152 | } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != |
153 | 0) { | |
7c673cae FG |
154 | return Status::InvalidArgument( |
155 | "Prefix extractor given doesn't match the one used to build " | |
156 | "PlainTable"); | |
157 | } | |
158 | } | |
159 | ||
160 | EncodingType encoding_type = kPlain; | |
161 | auto encoding_type_prop = | |
162 | user_props.find(PlainTablePropertyNames::kEncodingType); | |
163 | if (encoding_type_prop != user_props.end()) { | |
164 | encoding_type = static_cast<EncodingType>( | |
165 | DecodeFixed32(encoding_type_prop->second.c_str())); | |
166 | } | |
167 | ||
168 | std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( | |
169 | ioptions, std::move(file), env_options, internal_comparator, | |
f67539c2 | 170 | encoding_type, file_size, props.get(), prefix_extractor)); |
7c673cae FG |
171 | |
172 | s = new_reader->MmapDataIfNeeded(); | |
173 | if (!s.ok()) { | |
174 | return s; | |
175 | } | |
176 | ||
177 | if (!full_scan_mode) { | |
f67539c2 TL |
178 | s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key, |
179 | hash_table_ratio, index_sparseness, | |
180 | huge_page_tlb_size); | |
7c673cae FG |
181 | if (!s.ok()) { |
182 | return s; | |
183 | } | |
184 | } else { | |
185 | // Flag to indicate it is a full scan mode so that none of the indexes | |
186 | // can be used. | |
187 | new_reader->full_scan_mode_ = true; | |
188 | } | |
f67539c2 TL |
189 | // PopulateIndex can add to the props, so don't store them until now |
190 | new_reader->table_properties_ = props; | |
7c673cae | 191 | |
494da23a TL |
192 | if (immortal_table && new_reader->file_info_.is_mmap_mode) { |
193 | new_reader->dummy_cleanable_.reset(new Cleanable()); | |
194 | } | |
195 | ||
7c673cae FG |
196 | *table_reader = std::move(new_reader); |
197 | return s; | |
198 | } | |
199 | ||
200 | void PlainTableReader::SetupForCompaction() { | |
201 | } | |
202 | ||
11fdf7f2 TL |
203 | InternalIterator* PlainTableReader::NewIterator( |
204 | const ReadOptions& options, const SliceTransform* /* prefix_extractor */, | |
f67539c2 | 205 | Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, |
20effc67 TL |
206 | size_t /*compaction_readahead_size*/, |
207 | bool /* allow_unprepared_value */) { | |
f67539c2 TL |
208 | // Not necessarily used here, but make sure this has been initialized |
209 | assert(table_properties_); | |
210 | ||
211 | // Auto prefix mode is not implemented in PlainTable. | |
212 | bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && | |
213 | !options.auto_prefix_mode; | |
7c673cae | 214 | if (arena == nullptr) { |
11fdf7f2 | 215 | return new PlainTableIterator(this, use_prefix_seek); |
7c673cae FG |
216 | } else { |
217 | auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); | |
11fdf7f2 | 218 | return new (mem) PlainTableIterator(this, use_prefix_seek); |
7c673cae FG |
219 | } |
220 | } | |
221 | ||
222 | Status PlainTableReader::PopulateIndexRecordList( | |
494da23a TL |
223 | PlainTableIndexBuilder* index_builder, |
224 | std::vector<uint32_t>* prefix_hashes) { | |
7c673cae FG |
225 | Slice prev_key_prefix_slice; |
226 | std::string prev_key_prefix_buf; | |
227 | uint32_t pos = data_start_offset_; | |
228 | ||
229 | bool is_first_record = true; | |
230 | Slice key_prefix_slice; | |
231 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
11fdf7f2 | 232 | prefix_extractor_); |
7c673cae FG |
233 | while (pos < file_info_.data_end_offset) { |
234 | uint32_t key_offset = pos; | |
235 | ParsedInternalKey key; | |
236 | Slice value_slice; | |
237 | bool seekable = false; | |
238 | Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); | |
239 | if (!s.ok()) { | |
240 | return s; | |
241 | } | |
242 | ||
243 | key_prefix_slice = GetPrefix(key); | |
244 | if (enable_bloom_) { | |
245 | bloom_.AddHash(GetSliceHash(key.user_key)); | |
246 | } else { | |
247 | if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { | |
248 | if (!is_first_record) { | |
249 | prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); | |
250 | } | |
251 | if (file_info_.is_mmap_mode) { | |
252 | prev_key_prefix_slice = key_prefix_slice; | |
253 | } else { | |
254 | prev_key_prefix_buf = key_prefix_slice.ToString(); | |
255 | prev_key_prefix_slice = prev_key_prefix_buf; | |
256 | } | |
257 | } | |
258 | } | |
259 | ||
260 | index_builder->AddKeyPrefix(GetPrefix(key), key_offset); | |
261 | ||
262 | if (!seekable && is_first_record) { | |
263 | return Status::Corruption("Key for a prefix is not seekable"); | |
264 | } | |
265 | ||
266 | is_first_record = false; | |
267 | } | |
268 | ||
269 | prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); | |
270 | auto s = index_.InitFromRawData(index_builder->Finish()); | |
271 | return s; | |
272 | } | |
273 | ||
f67539c2 TL |
274 | void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, |
275 | size_t huge_page_tlb_size) { | |
276 | uint32_t bloom_total_bits = num_keys * bloom_bits_per_key; | |
277 | if (bloom_total_bits > 0) { | |
278 | enable_bloom_ = true; | |
279 | bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, | |
280 | huge_page_tlb_size, ioptions_.info_log); | |
7c673cae FG |
281 | } |
282 | } | |
283 | ||
f67539c2 | 284 | void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) { |
7c673cae | 285 | assert(bloom_.IsInitialized()); |
f67539c2 | 286 | for (const auto prefix_hash : prefix_hashes) { |
7c673cae FG |
287 | bloom_.AddHash(prefix_hash); |
288 | } | |
289 | } | |
290 | ||
291 | Status PlainTableReader::MmapDataIfNeeded() { | |
292 | if (file_info_.is_mmap_mode) { | |
293 | // Get mmapped memory. | |
20effc67 TL |
294 | return file_info_.file->Read(IOOptions(), 0, |
295 | static_cast<size_t>(file_size_), | |
296 | &file_info_.file_data, nullptr, nullptr); | |
7c673cae FG |
297 | } |
298 | return Status::OK(); | |
299 | } | |
300 | ||
301 | Status PlainTableReader::PopulateIndex(TableProperties* props, | |
302 | int bloom_bits_per_key, | |
303 | double hash_table_ratio, | |
304 | size_t index_sparseness, | |
305 | size_t huge_page_tlb_size) { | |
306 | assert(props != nullptr); | |
7c673cae FG |
307 | |
308 | BlockContents index_block_contents; | |
11fdf7f2 TL |
309 | Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, |
310 | file_size_, kPlainTableMagicNumber, ioptions_, | |
311 | PlainTableIndexBuilder::kPlainTableIndexBlock, | |
f67539c2 | 312 | BlockType::kIndex, &index_block_contents, |
11fdf7f2 | 313 | true /* compression_type_missing */); |
7c673cae FG |
314 | |
315 | bool index_in_file = s.ok(); | |
316 | ||
317 | BlockContents bloom_block_contents; | |
318 | bool bloom_in_file = false; | |
319 | // We only need to read the bloom block if index block is in file. | |
320 | if (index_in_file) { | |
11fdf7f2 TL |
321 | s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, |
322 | file_size_, kPlainTableMagicNumber, ioptions_, | |
f67539c2 TL |
323 | BloomBlockBuilder::kBloomBlock, BlockType::kFilter, |
324 | &bloom_block_contents, | |
11fdf7f2 | 325 | true /* compression_type_missing */); |
7c673cae FG |
326 | bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; |
327 | } | |
328 | ||
329 | Slice* bloom_block; | |
330 | if (bloom_in_file) { | |
331 | // If bloom_block_contents.allocation is not empty (which will be the case | |
332 | // for non-mmap mode), it holds the alloated memory for the bloom block. | |
333 | // It needs to be kept alive to keep `bloom_block` valid. | |
334 | bloom_block_alloc_ = std::move(bloom_block_contents.allocation); | |
335 | bloom_block = &bloom_block_contents.data; | |
336 | } else { | |
337 | bloom_block = nullptr; | |
338 | } | |
339 | ||
340 | Slice* index_block; | |
341 | if (index_in_file) { | |
342 | // If index_block_contents.allocation is not empty (which will be the case | |
343 | // for non-mmap mode), it holds the alloated memory for the index block. | |
344 | // It needs to be kept alive to keep `index_block` valid. | |
345 | index_block_alloc_ = std::move(index_block_contents.allocation); | |
346 | index_block = &index_block_contents.data; | |
347 | } else { | |
348 | index_block = nullptr; | |
349 | } | |
350 | ||
11fdf7f2 TL |
351 | if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { |
352 | // moptions.prefix_extractor is requried for a hash-based look-up. | |
7c673cae FG |
353 | return Status::NotSupported( |
354 | "PlainTable requires a prefix extractor enable prefix hash mode."); | |
355 | } | |
356 | ||
357 | // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows | |
358 | // for a prefix (starting from the first one), generate a record of (hash, | |
359 | // offset) and append it to IndexRecordList, which is a data structure created | |
360 | // to store them. | |
361 | ||
362 | if (!index_in_file) { | |
363 | // Allocate bloom filter here for total order mode. | |
364 | if (IsTotalOrderMode()) { | |
f67539c2 TL |
365 | AllocateBloom(bloom_bits_per_key, |
366 | static_cast<uint32_t>(props->num_entries), | |
367 | huge_page_tlb_size); | |
7c673cae FG |
368 | } |
369 | } else if (bloom_in_file) { | |
370 | enable_bloom_ = true; | |
371 | auto num_blocks_property = props->user_collected_properties.find( | |
372 | PlainTablePropertyNames::kNumBloomBlocks); | |
373 | ||
374 | uint32_t num_blocks = 0; | |
375 | if (num_blocks_property != props->user_collected_properties.end()) { | |
376 | Slice temp_slice(num_blocks_property->second); | |
377 | if (!GetVarint32(&temp_slice, &num_blocks)) { | |
378 | num_blocks = 0; | |
379 | } | |
380 | } | |
381 | // cast away const qualifier, because bloom_ won't be changed | |
f67539c2 TL |
382 | bloom_.SetRawData(const_cast<char*>(bloom_block->data()), |
383 | static_cast<uint32_t>(bloom_block->size()) * 8, | |
384 | num_blocks); | |
7c673cae FG |
385 | } else { |
386 | // Index in file but no bloom in file. Disable bloom filter in this case. | |
387 | enable_bloom_ = false; | |
388 | bloom_bits_per_key = 0; | |
389 | } | |
390 | ||
11fdf7f2 TL |
391 | PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, |
392 | index_sparseness, hash_table_ratio, | |
393 | huge_page_tlb_size); | |
7c673cae FG |
394 | |
395 | std::vector<uint32_t> prefix_hashes; | |
396 | if (!index_in_file) { | |
f67539c2 | 397 | // Populates _bloom if enabled (total order mode) |
7c673cae FG |
398 | s = PopulateIndexRecordList(&index_builder, &prefix_hashes); |
399 | if (!s.ok()) { | |
400 | return s; | |
401 | } | |
402 | } else { | |
403 | s = index_.InitFromRawData(*index_block); | |
404 | if (!s.ok()) { | |
405 | return s; | |
406 | } | |
407 | } | |
408 | ||
409 | if (!index_in_file) { | |
f67539c2 TL |
410 | if (!IsTotalOrderMode()) { |
411 | // Calculated bloom filter size and allocate memory for | |
412 | // bloom filter based on the number of prefixes, then fill it. | |
413 | AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(), | |
414 | huge_page_tlb_size); | |
415 | if (enable_bloom_) { | |
416 | FillBloom(prefix_hashes); | |
417 | } | |
418 | } | |
7c673cae FG |
419 | } |
420 | ||
421 | // Fill two table properties. | |
422 | if (!index_in_file) { | |
423 | props->user_collected_properties["plain_table_hash_table_size"] = | |
424 | ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); | |
425 | props->user_collected_properties["plain_table_sub_index_size"] = | |
426 | ToString(index_.GetSubIndexSize()); | |
427 | } else { | |
428 | props->user_collected_properties["plain_table_hash_table_size"] = | |
429 | ToString(0); | |
430 | props->user_collected_properties["plain_table_sub_index_size"] = | |
431 | ToString(0); | |
432 | } | |
433 | ||
434 | return Status::OK(); | |
435 | } | |
436 | ||
437 | Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, | |
438 | const Slice& target, const Slice& prefix, | |
439 | uint32_t prefix_hash, bool& prefix_matched, | |
440 | uint32_t* offset) const { | |
441 | prefix_matched = false; | |
442 | uint32_t prefix_index_offset; | |
443 | auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); | |
444 | if (res == PlainTableIndex::kNoPrefixForBucket) { | |
445 | *offset = file_info_.data_end_offset; | |
446 | return Status::OK(); | |
447 | } else if (res == PlainTableIndex::kDirectToFile) { | |
448 | *offset = prefix_index_offset; | |
449 | return Status::OK(); | |
450 | } | |
451 | ||
452 | // point to sub-index, need to do a binary search | |
20effc67 | 453 | uint32_t upper_bound = 0; |
7c673cae FG |
454 | const char* base_ptr = |
455 | index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); | |
456 | uint32_t low = 0; | |
457 | uint32_t high = upper_bound; | |
458 | ParsedInternalKey mid_key; | |
459 | ParsedInternalKey parsed_target; | |
20effc67 TL |
460 | Status s = ParseInternalKey(target, &parsed_target, |
461 | false /* log_err_key */); // TODO | |
462 | if (!s.ok()) return s; | |
7c673cae FG |
463 | |
464 | // The key is between [low, high). Do a binary search between it. | |
465 | while (high - low > 1) { | |
466 | uint32_t mid = (high + low) / 2; | |
467 | uint32_t file_offset = GetFixed32Element(base_ptr, mid); | |
468 | uint32_t tmp; | |
20effc67 | 469 | s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); |
7c673cae FG |
470 | if (!s.ok()) { |
471 | return s; | |
472 | } | |
473 | int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); | |
474 | if (cmp_result < 0) { | |
475 | low = mid; | |
476 | } else { | |
477 | if (cmp_result == 0) { | |
478 | // Happen to have found the exact key or target is smaller than the | |
479 | // first key after base_offset. | |
480 | prefix_matched = true; | |
481 | *offset = file_offset; | |
482 | return Status::OK(); | |
483 | } else { | |
484 | high = mid; | |
485 | } | |
486 | } | |
487 | } | |
488 | // Both of the key at the position low or low+1 could share the same | |
489 | // prefix as target. We need to rule out one of them to avoid to go | |
490 | // to the wrong prefix. | |
491 | ParsedInternalKey low_key; | |
492 | uint32_t tmp; | |
493 | uint32_t low_key_offset = GetFixed32Element(base_ptr, low); | |
20effc67 | 494 | s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); |
7c673cae FG |
495 | if (!s.ok()) { |
496 | return s; | |
497 | } | |
498 | ||
499 | if (GetPrefix(low_key) == prefix) { | |
500 | prefix_matched = true; | |
501 | *offset = low_key_offset; | |
502 | } else if (low + 1 < upper_bound) { | |
503 | // There is possible a next prefix, return it | |
504 | prefix_matched = false; | |
505 | *offset = GetFixed32Element(base_ptr, low + 1); | |
506 | } else { | |
507 | // target is larger than a key of the last prefix in this bucket | |
508 | // but with a different prefix. Key does not exist. | |
509 | *offset = file_info_.data_end_offset; | |
510 | } | |
511 | return Status::OK(); | |
512 | } | |
513 | ||
514 | bool PlainTableReader::MatchBloom(uint32_t hash) const { | |
515 | if (!enable_bloom_) { | |
516 | return true; | |
517 | } | |
518 | ||
519 | if (bloom_.MayContainHash(hash)) { | |
520 | PERF_COUNTER_ADD(bloom_sst_hit_count, 1); | |
521 | return true; | |
522 | } else { | |
523 | PERF_COUNTER_ADD(bloom_sst_miss_count, 1); | |
524 | return false; | |
525 | } | |
526 | } | |
527 | ||
528 | Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, | |
529 | ParsedInternalKey* parsed_key, | |
530 | Slice* internal_key, Slice* value, | |
531 | bool* seekable) const { | |
532 | if (*offset == file_info_.data_end_offset) { | |
533 | *offset = file_info_.data_end_offset; | |
534 | return Status::OK(); | |
535 | } | |
536 | ||
537 | if (*offset > file_info_.data_end_offset) { | |
538 | return Status::Corruption("Offset is out of file size"); | |
539 | } | |
540 | ||
541 | uint32_t bytes_read; | |
542 | Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, | |
543 | &bytes_read, seekable); | |
544 | if (!s.ok()) { | |
545 | return s; | |
546 | } | |
547 | *offset = *offset + bytes_read; | |
548 | return Status::OK(); | |
549 | } | |
550 | ||
551 | void PlainTableReader::Prepare(const Slice& target) { | |
552 | if (enable_bloom_) { | |
553 | uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); | |
554 | bloom_.Prefetch(prefix_hash); | |
555 | } | |
556 | } | |
557 | ||
11fdf7f2 TL |
558 | Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, |
559 | GetContext* get_context, | |
560 | const SliceTransform* /* prefix_extractor */, | |
561 | bool /*skip_filters*/) { | |
7c673cae FG |
562 | // Check bloom filter first. |
563 | Slice prefix_slice; | |
564 | uint32_t prefix_hash; | |
565 | if (IsTotalOrderMode()) { | |
566 | if (full_scan_mode_) { | |
567 | status_ = | |
568 | Status::InvalidArgument("Get() is not allowed in full scan mode."); | |
569 | } | |
570 | // Match whole user key for bloom filter check. | |
571 | if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { | |
572 | return Status::OK(); | |
573 | } | |
574 | // in total order mode, there is only one bucket 0, and we always use empty | |
575 | // prefix. | |
576 | prefix_slice = Slice(); | |
577 | prefix_hash = 0; | |
578 | } else { | |
579 | prefix_slice = GetPrefix(target); | |
580 | prefix_hash = GetSliceHash(prefix_slice); | |
581 | if (!MatchBloom(prefix_hash)) { | |
582 | return Status::OK(); | |
583 | } | |
584 | } | |
585 | uint32_t offset; | |
586 | bool prefix_match; | |
587 | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, | |
11fdf7f2 | 588 | prefix_extractor_); |
7c673cae FG |
589 | Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, |
590 | prefix_match, &offset); | |
591 | ||
592 | if (!s.ok()) { | |
593 | return s; | |
594 | } | |
595 | ParsedInternalKey found_key; | |
596 | ParsedInternalKey parsed_target; | |
20effc67 TL |
597 | s = ParseInternalKey(target, &parsed_target, |
598 | false /* log_err_key */); // TODO | |
599 | if (!s.ok()) return s; | |
600 | ||
7c673cae FG |
601 | Slice found_value; |
602 | while (offset < file_info_.data_end_offset) { | |
603 | s = Next(&decoder, &offset, &found_key, nullptr, &found_value); | |
604 | if (!s.ok()) { | |
605 | return s; | |
606 | } | |
607 | if (!prefix_match) { | |
608 | // Need to verify prefix for the first key found if it is not yet | |
609 | // checked. | |
610 | if (GetPrefix(found_key) != prefix_slice) { | |
611 | return Status::OK(); | |
612 | } | |
613 | prefix_match = true; | |
614 | } | |
615 | // TODO(ljin): since we know the key comparison result here, | |
616 | // can we enable the fast path? | |
617 | if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { | |
11fdf7f2 | 618 | bool dont_care __attribute__((__unused__)); |
494da23a TL |
619 | if (!get_context->SaveValue(found_key, found_value, &dont_care, |
620 | dummy_cleanable_.get())) { | |
7c673cae FG |
621 | break; |
622 | } | |
623 | } | |
624 | } | |
625 | return Status::OK(); | |
626 | } | |
627 | ||
f67539c2 TL |
628 | uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, |
629 | TableReaderCaller /*caller*/) { | |
630 | return 0; | |
631 | } | |
632 | ||
633 | uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, | |
634 | const Slice& /*end*/, | |
635 | TableReaderCaller /*caller*/) { | |
7c673cae FG |
636 | return 0; |
637 | } | |
638 | ||
639 | PlainTableIterator::PlainTableIterator(PlainTableReader* table, | |
640 | bool use_prefix_seek) | |
641 | : table_(table), | |
642 | decoder_(&table_->file_info_, table_->encoding_type_, | |
643 | table_->user_key_len_, table_->prefix_extractor_), | |
644 | use_prefix_seek_(use_prefix_seek) { | |
645 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
646 | } | |
647 | ||
648 | PlainTableIterator::~PlainTableIterator() { | |
649 | } | |
650 | ||
651 | bool PlainTableIterator::Valid() const { | |
652 | return offset_ < table_->file_info_.data_end_offset && | |
653 | offset_ >= table_->data_start_offset_; | |
654 | } | |
655 | ||
656 | void PlainTableIterator::SeekToFirst() { | |
11fdf7f2 | 657 | status_ = Status::OK(); |
7c673cae FG |
658 | next_offset_ = table_->data_start_offset_; |
659 | if (next_offset_ >= table_->file_info_.data_end_offset) { | |
660 | next_offset_ = offset_ = table_->file_info_.data_end_offset; | |
661 | } else { | |
662 | Next(); | |
663 | } | |
664 | } | |
665 | ||
666 | void PlainTableIterator::SeekToLast() { | |
667 | assert(false); | |
668 | status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); | |
11fdf7f2 | 669 | next_offset_ = offset_ = table_->file_info_.data_end_offset; |
7c673cae FG |
670 | } |
671 | ||
672 | void PlainTableIterator::Seek(const Slice& target) { | |
11fdf7f2 TL |
673 | if (use_prefix_seek_ != !table_->IsTotalOrderMode()) { |
674 | // This check is done here instead of NewIterator() to permit creating an | |
675 | // iterator with total_order_seek = true even if we won't be able to Seek() | |
676 | // it. This is needed for compaction: it creates iterator with | |
677 | // total_order_seek = true but usually never does Seek() on it, | |
678 | // only SeekToFirst(). | |
679 | status_ = | |
680 | Status::InvalidArgument( | |
681 | "total_order_seek not implemented for PlainTable."); | |
682 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
683 | return; | |
684 | } | |
685 | ||
7c673cae FG |
686 | // If the user doesn't set prefix seek option and we are not able to do a |
687 | // total Seek(). assert failure. | |
11fdf7f2 | 688 | if (table_->IsTotalOrderMode()) { |
7c673cae FG |
689 | if (table_->full_scan_mode_) { |
690 | status_ = | |
691 | Status::InvalidArgument("Seek() is not allowed in full scan mode."); | |
692 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
693 | return; | |
694 | } else if (table_->GetIndexSize() > 1) { | |
695 | assert(false); | |
696 | status_ = Status::NotSupported( | |
697 | "PlainTable cannot issue non-prefix seek unless in total order " | |
698 | "mode."); | |
699 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
700 | return; | |
701 | } | |
702 | } | |
703 | ||
704 | Slice prefix_slice = table_->GetPrefix(target); | |
705 | uint32_t prefix_hash = 0; | |
706 | // Bloom filter is ignored in total-order mode. | |
707 | if (!table_->IsTotalOrderMode()) { | |
708 | prefix_hash = GetSliceHash(prefix_slice); | |
709 | if (!table_->MatchBloom(prefix_hash)) { | |
11fdf7f2 | 710 | status_ = Status::OK(); |
7c673cae FG |
711 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
712 | return; | |
713 | } | |
714 | } | |
715 | bool prefix_match; | |
716 | status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, | |
717 | prefix_match, &next_offset_); | |
718 | if (!status_.ok()) { | |
719 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
720 | return; | |
721 | } | |
722 | ||
723 | if (next_offset_ < table_->file_info_.data_end_offset) { | |
724 | for (Next(); status_.ok() && Valid(); Next()) { | |
725 | if (!prefix_match) { | |
726 | // Need to verify the first key's prefix | |
727 | if (table_->GetPrefix(key()) != prefix_slice) { | |
728 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
729 | break; | |
730 | } | |
731 | prefix_match = true; | |
732 | } | |
733 | if (table_->internal_comparator_.Compare(key(), target) >= 0) { | |
734 | break; | |
735 | } | |
736 | } | |
737 | } else { | |
738 | offset_ = table_->file_info_.data_end_offset; | |
739 | } | |
740 | } | |
741 | ||
11fdf7f2 | 742 | void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { |
7c673cae FG |
743 | assert(false); |
744 | status_ = | |
745 | Status::NotSupported("SeekForPrev() is not supported in PlainTable"); | |
11fdf7f2 | 746 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
7c673cae FG |
747 | } |
748 | ||
749 | void PlainTableIterator::Next() { | |
750 | offset_ = next_offset_; | |
751 | if (offset_ < table_->file_info_.data_end_offset) { | |
752 | Slice tmp_slice; | |
753 | ParsedInternalKey parsed_key; | |
754 | status_ = | |
755 | table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); | |
756 | if (!status_.ok()) { | |
757 | offset_ = next_offset_ = table_->file_info_.data_end_offset; | |
758 | } | |
759 | } | |
760 | } | |
761 | ||
762 | void PlainTableIterator::Prev() { | |
763 | assert(false); | |
764 | } | |
765 | ||
766 | Slice PlainTableIterator::key() const { | |
767 | assert(Valid()); | |
768 | return key_; | |
769 | } | |
770 | ||
771 | Slice PlainTableIterator::value() const { | |
772 | assert(Valid()); | |
773 | return value_; | |
774 | } | |
775 | ||
776 | Status PlainTableIterator::status() const { | |
777 | return status_; | |
778 | } | |
779 | ||
f67539c2 | 780 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 781 | #endif // ROCKSDB_LITE |