]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #ifndef ROCKSDB_LITE | |
11 | #include "table/cuckoo_table_reader.h" | |
12 | ||
13 | #include <algorithm> | |
14 | #include <limits> | |
15 | #include <string> | |
16 | #include <utility> | |
17 | #include <vector> | |
18 | #include "rocksdb/iterator.h" | |
19 | #include "rocksdb/table.h" | |
20 | #include "table/internal_iterator.h" | |
21 | #include "table/meta_blocks.h" | |
22 | #include "table/cuckoo_table_factory.h" | |
23 | #include "table/get_context.h" | |
24 | #include "util/arena.h" | |
25 | #include "util/coding.h" | |
26 | ||
27 | namespace rocksdb { | |
28 | namespace { | |
29 | const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); | |
30 | const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); | |
31 | } | |
32 | ||
33 | extern const uint64_t kCuckooTableMagicNumber; | |
34 | ||
35 | CuckooTableReader::CuckooTableReader( | |
36 | const ImmutableCFOptions& ioptions, | |
37 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, | |
38 | const Comparator* comparator, | |
39 | uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) | |
40 | : file_(std::move(file)), | |
11fdf7f2 TL |
41 | is_last_level_(false), |
42 | identity_as_first_hash_(false), | |
43 | use_module_hash_(false), | |
44 | num_hash_func_(0), | |
45 | unused_key_(""), | |
46 | key_length_(0), | |
47 | user_key_length_(0), | |
48 | value_length_(0), | |
49 | bucket_length_(0), | |
50 | cuckoo_block_size_(0), | |
51 | cuckoo_block_bytes_minus_one_(0), | |
52 | table_size_(0), | |
7c673cae FG |
53 | ucomp_(comparator), |
54 | get_slice_hash_(get_slice_hash) { | |
55 | if (!ioptions.allow_mmap_reads) { | |
56 | status_ = Status::InvalidArgument("File is not mmaped"); | |
57 | } | |
58 | TableProperties* props = nullptr; | |
59 | status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, | |
11fdf7f2 | 60 | ioptions, &props, true /* compression_type_missing */); |
7c673cae FG |
61 | if (!status_.ok()) { |
62 | return; | |
63 | } | |
64 | table_props_.reset(props); | |
65 | auto& user_props = props->user_collected_properties; | |
66 | auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); | |
67 | if (hash_funs == user_props.end()) { | |
68 | status_ = Status::Corruption("Number of hash functions not found"); | |
69 | return; | |
70 | } | |
71 | num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); | |
72 | auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); | |
73 | if (unused_key == user_props.end()) { | |
74 | status_ = Status::Corruption("Empty bucket value not found"); | |
75 | return; | |
76 | } | |
77 | unused_key_ = unused_key->second; | |
78 | ||
79 | key_length_ = static_cast<uint32_t>(props->fixed_key_len); | |
80 | auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); | |
81 | if (user_key_len == user_props.end()) { | |
82 | status_ = Status::Corruption("User key length not found"); | |
83 | return; | |
84 | } | |
85 | user_key_length_ = *reinterpret_cast<const uint32_t*>( | |
86 | user_key_len->second.data()); | |
87 | ||
88 | auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); | |
89 | if (value_length == user_props.end()) { | |
90 | status_ = Status::Corruption("Value length not found"); | |
91 | return; | |
92 | } | |
93 | value_length_ = *reinterpret_cast<const uint32_t*>( | |
94 | value_length->second.data()); | |
95 | bucket_length_ = key_length_ + value_length_; | |
96 | ||
97 | auto hash_table_size = user_props.find( | |
98 | CuckooTablePropertyNames::kHashTableSize); | |
99 | if (hash_table_size == user_props.end()) { | |
100 | status_ = Status::Corruption("Hash table size not found"); | |
101 | return; | |
102 | } | |
103 | table_size_ = *reinterpret_cast<const uint64_t*>( | |
104 | hash_table_size->second.data()); | |
105 | ||
106 | auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); | |
107 | if (is_last_level == user_props.end()) { | |
108 | status_ = Status::Corruption("Is last level not found"); | |
109 | return; | |
110 | } | |
111 | is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); | |
112 | ||
113 | auto identity_as_first_hash = user_props.find( | |
114 | CuckooTablePropertyNames::kIdentityAsFirstHash); | |
115 | if (identity_as_first_hash == user_props.end()) { | |
116 | status_ = Status::Corruption("identity as first hash not found"); | |
117 | return; | |
118 | } | |
119 | identity_as_first_hash_ = *reinterpret_cast<const bool*>( | |
120 | identity_as_first_hash->second.data()); | |
121 | ||
122 | auto use_module_hash = user_props.find( | |
123 | CuckooTablePropertyNames::kUseModuleHash); | |
124 | if (use_module_hash == user_props.end()) { | |
125 | status_ = Status::Corruption("hash type is not found"); | |
126 | return; | |
127 | } | |
128 | use_module_hash_ = *reinterpret_cast<const bool*>( | |
129 | use_module_hash->second.data()); | |
130 | auto cuckoo_block_size = user_props.find( | |
131 | CuckooTablePropertyNames::kCuckooBlockSize); | |
132 | if (cuckoo_block_size == user_props.end()) { | |
133 | status_ = Status::Corruption("Cuckoo block size not found"); | |
134 | return; | |
135 | } | |
136 | cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>( | |
137 | cuckoo_block_size->second.data()); | |
138 | cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; | |
11fdf7f2 | 139 | status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr); |
7c673cae FG |
140 | } |
141 | ||
11fdf7f2 TL |
142 | Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, |
143 | const Slice& key, GetContext* get_context, | |
144 | const SliceTransform* /* prefix_extractor */, | |
145 | bool /*skip_filters*/) { | |
7c673cae FG |
146 | assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); |
147 | Slice user_key = ExtractUserKey(key); | |
148 | for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { | |
149 | uint64_t offset = bucket_length_ * CuckooHash( | |
150 | user_key, hash_cnt, use_module_hash_, table_size_, | |
151 | identity_as_first_hash_, get_slice_hash_); | |
152 | const char* bucket = &file_data_.data()[offset]; | |
153 | for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; | |
154 | ++block_idx, bucket += bucket_length_) { | |
155 | if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), | |
156 | Slice(bucket, user_key.size()))) { | |
157 | return Status::OK(); | |
158 | } | |
159 | // Here, we compare only the user key part as we support only one entry | |
160 | // per user key and we don't support snapshot. | |
161 | if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { | |
162 | Slice value(bucket + key_length_, value_length_); | |
163 | if (is_last_level_) { | |
164 | // Sequence number is not stored at the last level, so we will use | |
165 | // kMaxSequenceNumber since it is unknown. This could cause some | |
166 | // transactions to fail to lock a key due to known sequence number. | |
167 | // However, it is expected for anyone to use a CuckooTable in a | |
168 | // TransactionDB. | |
169 | get_context->SaveValue(value, kMaxSequenceNumber); | |
170 | } else { | |
171 | Slice full_key(bucket, key_length_); | |
172 | ParsedInternalKey found_ikey; | |
173 | ParseInternalKey(full_key, &found_ikey); | |
11fdf7f2 TL |
174 | bool dont_care __attribute__((__unused__)); |
175 | get_context->SaveValue(found_ikey, value, &dont_care); | |
7c673cae FG |
176 | } |
177 | // We don't support merge operations. So, we return here. | |
178 | return Status::OK(); | |
179 | } | |
180 | } | |
181 | } | |
182 | return Status::OK(); | |
183 | } | |
184 | ||
185 | void CuckooTableReader::Prepare(const Slice& key) { | |
186 | // Prefetch the first Cuckoo Block. | |
187 | Slice user_key = ExtractUserKey(key); | |
188 | uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) + | |
189 | bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, | |
190 | identity_as_first_hash_, nullptr); | |
191 | uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; | |
192 | for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { | |
193 | PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); | |
194 | } | |
195 | } | |
196 | ||
197 | class CuckooTableIterator : public InternalIterator { | |
198 | public: | |
199 | explicit CuckooTableIterator(CuckooTableReader* reader); | |
494da23a | 200 | ~CuckooTableIterator() override {} |
7c673cae FG |
201 | bool Valid() const override; |
202 | void SeekToFirst() override; | |
203 | void SeekToLast() override; | |
204 | void Seek(const Slice& target) override; | |
205 | void SeekForPrev(const Slice& target) override; | |
206 | void Next() override; | |
207 | void Prev() override; | |
208 | Slice key() const override; | |
209 | Slice value() const override; | |
11fdf7f2 | 210 | Status status() const override { return Status::OK(); } |
7c673cae FG |
211 | void InitIfNeeded(); |
212 | ||
213 | private: | |
214 | struct BucketComparator { | |
215 | BucketComparator(const Slice& file_data, const Comparator* ucomp, | |
216 | uint32_t bucket_len, uint32_t user_key_len, | |
217 | const Slice& target = Slice()) | |
218 | : file_data_(file_data), | |
219 | ucomp_(ucomp), | |
220 | bucket_len_(bucket_len), | |
221 | user_key_len_(user_key_len), | |
222 | target_(target) {} | |
223 | bool operator()(const uint32_t first, const uint32_t second) const { | |
224 | const char* first_bucket = | |
225 | (first == kInvalidIndex) ? target_.data() : | |
226 | &file_data_.data()[first * bucket_len_]; | |
227 | const char* second_bucket = | |
228 | (second == kInvalidIndex) ? target_.data() : | |
229 | &file_data_.data()[second * bucket_len_]; | |
230 | return ucomp_->Compare(Slice(first_bucket, user_key_len_), | |
231 | Slice(second_bucket, user_key_len_)) < 0; | |
232 | } | |
233 | private: | |
234 | const Slice file_data_; | |
235 | const Comparator* ucomp_; | |
236 | const uint32_t bucket_len_; | |
237 | const uint32_t user_key_len_; | |
238 | const Slice target_; | |
239 | }; | |
240 | ||
241 | const BucketComparator bucket_comparator_; | |
242 | void PrepareKVAtCurrIdx(); | |
243 | CuckooTableReader* reader_; | |
244 | bool initialized_; | |
7c673cae FG |
245 | // Contains a map of keys to bucket_id sorted in key order. |
246 | std::vector<uint32_t> sorted_bucket_ids_; | |
247 | // We assume that the number of items can be stored in uint32 (4 Billion). | |
248 | uint32_t curr_key_idx_; | |
249 | Slice curr_value_; | |
250 | IterKey curr_key_; | |
251 | // No copying allowed | |
252 | CuckooTableIterator(const CuckooTableIterator&) = delete; | |
253 | void operator=(const Iterator&) = delete; | |
254 | }; | |
255 | ||
256 | CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) | |
257 | : bucket_comparator_(reader->file_data_, reader->ucomp_, | |
258 | reader->bucket_length_, reader->user_key_length_), | |
259 | reader_(reader), | |
260 | initialized_(false), | |
261 | curr_key_idx_(kInvalidIndex) { | |
262 | sorted_bucket_ids_.clear(); | |
263 | curr_value_.clear(); | |
264 | curr_key_.Clear(); | |
265 | } | |
266 | ||
267 | void CuckooTableIterator::InitIfNeeded() { | |
268 | if (initialized_) { | |
269 | return; | |
270 | } | |
11fdf7f2 | 271 | sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries)); |
7c673cae FG |
272 | uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; |
273 | assert(num_buckets < kInvalidIndex); | |
274 | const char* bucket = reader_->file_data_.data(); | |
275 | for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { | |
276 | if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { | |
277 | sorted_bucket_ids_.push_back(bucket_id); | |
278 | } | |
279 | bucket += reader_->bucket_length_; | |
280 | } | |
281 | assert(sorted_bucket_ids_.size() == | |
282 | reader_->GetTableProperties()->num_entries); | |
283 | std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), | |
284 | bucket_comparator_); | |
285 | curr_key_idx_ = kInvalidIndex; | |
286 | initialized_ = true; | |
287 | } | |
288 | ||
289 | void CuckooTableIterator::SeekToFirst() { | |
290 | InitIfNeeded(); | |
291 | curr_key_idx_ = 0; | |
292 | PrepareKVAtCurrIdx(); | |
293 | } | |
294 | ||
295 | void CuckooTableIterator::SeekToLast() { | |
296 | InitIfNeeded(); | |
297 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; | |
298 | PrepareKVAtCurrIdx(); | |
299 | } | |
300 | ||
301 | void CuckooTableIterator::Seek(const Slice& target) { | |
302 | InitIfNeeded(); | |
303 | const BucketComparator seek_comparator( | |
304 | reader_->file_data_, reader_->ucomp_, | |
305 | reader_->bucket_length_, reader_->user_key_length_, | |
306 | ExtractUserKey(target)); | |
307 | auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), | |
308 | sorted_bucket_ids_.end(), | |
309 | kInvalidIndex, | |
310 | seek_comparator); | |
311 | curr_key_idx_ = | |
312 | static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); | |
313 | PrepareKVAtCurrIdx(); | |
314 | } | |
315 | ||
11fdf7f2 | 316 | void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { |
7c673cae FG |
317 | // Not supported |
318 | assert(false); | |
319 | } | |
320 | ||
321 | bool CuckooTableIterator::Valid() const { | |
322 | return curr_key_idx_ < sorted_bucket_ids_.size(); | |
323 | } | |
324 | ||
325 | void CuckooTableIterator::PrepareKVAtCurrIdx() { | |
326 | if (!Valid()) { | |
327 | curr_value_.clear(); | |
328 | curr_key_.Clear(); | |
329 | return; | |
330 | } | |
331 | uint32_t id = sorted_bucket_ids_[curr_key_idx_]; | |
332 | const char* offset = reader_->file_data_.data() + | |
333 | id * reader_->bucket_length_; | |
334 | if (reader_->is_last_level_) { | |
335 | // Always return internal key. | |
336 | curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), | |
337 | 0, kTypeValue); | |
338 | } else { | |
339 | curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); | |
340 | } | |
341 | curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); | |
342 | } | |
343 | ||
344 | void CuckooTableIterator::Next() { | |
345 | if (!Valid()) { | |
346 | curr_value_.clear(); | |
347 | curr_key_.Clear(); | |
348 | return; | |
349 | } | |
350 | ++curr_key_idx_; | |
351 | PrepareKVAtCurrIdx(); | |
352 | } | |
353 | ||
354 | void CuckooTableIterator::Prev() { | |
355 | if (curr_key_idx_ == 0) { | |
356 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); | |
357 | } | |
358 | if (!Valid()) { | |
359 | curr_value_.clear(); | |
360 | curr_key_.Clear(); | |
361 | return; | |
362 | } | |
363 | --curr_key_idx_; | |
364 | PrepareKVAtCurrIdx(); | |
365 | } | |
366 | ||
367 | Slice CuckooTableIterator::key() const { | |
368 | assert(Valid()); | |
369 | return curr_key_.GetInternalKey(); | |
370 | } | |
371 | ||
372 | Slice CuckooTableIterator::value() const { | |
373 | assert(Valid()); | |
374 | return curr_value_; | |
375 | } | |
376 | ||
7c673cae | 377 | InternalIterator* CuckooTableReader::NewIterator( |
11fdf7f2 TL |
378 | const ReadOptions& /*read_options*/, |
379 | const SliceTransform* /* prefix_extractor */, Arena* arena, | |
380 | bool /*skip_filters*/, bool /*for_compaction*/) { | |
7c673cae | 381 | if (!status().ok()) { |
11fdf7f2 | 382 | return NewErrorInternalIterator<Slice>( |
7c673cae FG |
383 | Status::Corruption("CuckooTableReader status is not okay."), arena); |
384 | } | |
7c673cae FG |
385 | CuckooTableIterator* iter; |
386 | if (arena == nullptr) { | |
387 | iter = new CuckooTableIterator(this); | |
388 | } else { | |
389 | auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); | |
390 | iter = new (iter_mem) CuckooTableIterator(this); | |
391 | } | |
392 | return iter; | |
393 | } | |
394 | ||
395 | size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } | |
396 | ||
397 | } // namespace rocksdb | |
398 | #endif |