]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #ifndef ROCKSDB_LITE | |
f67539c2 | 11 | #include "table/cuckoo/cuckoo_table_reader.h" |
7c673cae FG |
12 | |
13 | #include <algorithm> | |
14 | #include <limits> | |
15 | #include <string> | |
16 | #include <utility> | |
17 | #include <vector> | |
f67539c2 | 18 | #include "memory/arena.h" |
7c673cae FG |
19 | #include "rocksdb/iterator.h" |
20 | #include "rocksdb/table.h" | |
f67539c2 TL |
21 | #include "table/cuckoo/cuckoo_table_factory.h" |
22 | #include "table/get_context.h" | |
7c673cae FG |
23 | #include "table/internal_iterator.h" |
24 | #include "table/meta_blocks.h" | |
7c673cae FG |
25 | #include "util/coding.h" |
26 | ||
f67539c2 | 27 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
28 | namespace { |
29 | const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); | |
30 | const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); | |
31 | } | |
32 | ||
33 | extern const uint64_t kCuckooTableMagicNumber; | |
34 | ||
35 | CuckooTableReader::CuckooTableReader( | |
36 | const ImmutableCFOptions& ioptions, | |
37 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, | |
38 | const Comparator* comparator, | |
39 | uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) | |
40 | : file_(std::move(file)), | |
11fdf7f2 TL |
41 | is_last_level_(false), |
42 | identity_as_first_hash_(false), | |
43 | use_module_hash_(false), | |
44 | num_hash_func_(0), | |
45 | unused_key_(""), | |
46 | key_length_(0), | |
47 | user_key_length_(0), | |
48 | value_length_(0), | |
49 | bucket_length_(0), | |
50 | cuckoo_block_size_(0), | |
51 | cuckoo_block_bytes_minus_one_(0), | |
52 | table_size_(0), | |
7c673cae FG |
53 | ucomp_(comparator), |
54 | get_slice_hash_(get_slice_hash) { | |
55 | if (!ioptions.allow_mmap_reads) { | |
56 | status_ = Status::InvalidArgument("File is not mmaped"); | |
20effc67 | 57 | return; |
7c673cae FG |
58 | } |
59 | TableProperties* props = nullptr; | |
60 | status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, | |
11fdf7f2 | 61 | ioptions, &props, true /* compression_type_missing */); |
7c673cae FG |
62 | if (!status_.ok()) { |
63 | return; | |
64 | } | |
65 | table_props_.reset(props); | |
66 | auto& user_props = props->user_collected_properties; | |
67 | auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); | |
68 | if (hash_funs == user_props.end()) { | |
69 | status_ = Status::Corruption("Number of hash functions not found"); | |
70 | return; | |
71 | } | |
72 | num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); | |
73 | auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); | |
74 | if (unused_key == user_props.end()) { | |
75 | status_ = Status::Corruption("Empty bucket value not found"); | |
76 | return; | |
77 | } | |
78 | unused_key_ = unused_key->second; | |
79 | ||
80 | key_length_ = static_cast<uint32_t>(props->fixed_key_len); | |
81 | auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); | |
82 | if (user_key_len == user_props.end()) { | |
83 | status_ = Status::Corruption("User key length not found"); | |
84 | return; | |
85 | } | |
86 | user_key_length_ = *reinterpret_cast<const uint32_t*>( | |
87 | user_key_len->second.data()); | |
88 | ||
89 | auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); | |
90 | if (value_length == user_props.end()) { | |
91 | status_ = Status::Corruption("Value length not found"); | |
92 | return; | |
93 | } | |
94 | value_length_ = *reinterpret_cast<const uint32_t*>( | |
95 | value_length->second.data()); | |
96 | bucket_length_ = key_length_ + value_length_; | |
97 | ||
98 | auto hash_table_size = user_props.find( | |
99 | CuckooTablePropertyNames::kHashTableSize); | |
100 | if (hash_table_size == user_props.end()) { | |
101 | status_ = Status::Corruption("Hash table size not found"); | |
102 | return; | |
103 | } | |
104 | table_size_ = *reinterpret_cast<const uint64_t*>( | |
105 | hash_table_size->second.data()); | |
106 | ||
107 | auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); | |
108 | if (is_last_level == user_props.end()) { | |
109 | status_ = Status::Corruption("Is last level not found"); | |
110 | return; | |
111 | } | |
112 | is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); | |
113 | ||
114 | auto identity_as_first_hash = user_props.find( | |
115 | CuckooTablePropertyNames::kIdentityAsFirstHash); | |
116 | if (identity_as_first_hash == user_props.end()) { | |
117 | status_ = Status::Corruption("identity as first hash not found"); | |
118 | return; | |
119 | } | |
120 | identity_as_first_hash_ = *reinterpret_cast<const bool*>( | |
121 | identity_as_first_hash->second.data()); | |
122 | ||
123 | auto use_module_hash = user_props.find( | |
124 | CuckooTablePropertyNames::kUseModuleHash); | |
125 | if (use_module_hash == user_props.end()) { | |
126 | status_ = Status::Corruption("hash type is not found"); | |
127 | return; | |
128 | } | |
129 | use_module_hash_ = *reinterpret_cast<const bool*>( | |
130 | use_module_hash->second.data()); | |
131 | auto cuckoo_block_size = user_props.find( | |
132 | CuckooTablePropertyNames::kCuckooBlockSize); | |
133 | if (cuckoo_block_size == user_props.end()) { | |
134 | status_ = Status::Corruption("Cuckoo block size not found"); | |
135 | return; | |
136 | } | |
137 | cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>( | |
138 | cuckoo_block_size->second.data()); | |
139 | cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; | |
20effc67 TL |
140 | status_ = file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), |
141 | &file_data_, nullptr, nullptr); | |
7c673cae FG |
142 | } |
143 | ||
11fdf7f2 TL |
144 | Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, |
145 | const Slice& key, GetContext* get_context, | |
146 | const SliceTransform* /* prefix_extractor */, | |
147 | bool /*skip_filters*/) { | |
7c673cae FG |
148 | assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); |
149 | Slice user_key = ExtractUserKey(key); | |
150 | for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { | |
151 | uint64_t offset = bucket_length_ * CuckooHash( | |
152 | user_key, hash_cnt, use_module_hash_, table_size_, | |
153 | identity_as_first_hash_, get_slice_hash_); | |
154 | const char* bucket = &file_data_.data()[offset]; | |
155 | for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; | |
156 | ++block_idx, bucket += bucket_length_) { | |
157 | if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), | |
158 | Slice(bucket, user_key.size()))) { | |
159 | return Status::OK(); | |
160 | } | |
161 | // Here, we compare only the user key part as we support only one entry | |
162 | // per user key and we don't support snapshot. | |
163 | if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { | |
164 | Slice value(bucket + key_length_, value_length_); | |
165 | if (is_last_level_) { | |
166 | // Sequence number is not stored at the last level, so we will use | |
167 | // kMaxSequenceNumber since it is unknown. This could cause some | |
168 | // transactions to fail to lock a key due to known sequence number. | |
169 | // However, it is expected for anyone to use a CuckooTable in a | |
170 | // TransactionDB. | |
171 | get_context->SaveValue(value, kMaxSequenceNumber); | |
172 | } else { | |
173 | Slice full_key(bucket, key_length_); | |
174 | ParsedInternalKey found_ikey; | |
20effc67 TL |
175 | Status s = ParseInternalKey(full_key, &found_ikey, |
176 | false /* log_err_key */); // TODO | |
177 | if (!s.ok()) return s; | |
11fdf7f2 TL |
178 | bool dont_care __attribute__((__unused__)); |
179 | get_context->SaveValue(found_ikey, value, &dont_care); | |
7c673cae FG |
180 | } |
181 | // We don't support merge operations. So, we return here. | |
182 | return Status::OK(); | |
183 | } | |
184 | } | |
185 | } | |
186 | return Status::OK(); | |
187 | } | |
188 | ||
189 | void CuckooTableReader::Prepare(const Slice& key) { | |
190 | // Prefetch the first Cuckoo Block. | |
191 | Slice user_key = ExtractUserKey(key); | |
192 | uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) + | |
193 | bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, | |
194 | identity_as_first_hash_, nullptr); | |
195 | uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; | |
196 | for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { | |
197 | PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); | |
198 | } | |
199 | } | |
200 | ||
201 | class CuckooTableIterator : public InternalIterator { | |
202 | public: | |
203 | explicit CuckooTableIterator(CuckooTableReader* reader); | |
f67539c2 TL |
204 | // No copying allowed |
205 | CuckooTableIterator(const CuckooTableIterator&) = delete; | |
206 | void operator=(const Iterator&) = delete; | |
494da23a | 207 | ~CuckooTableIterator() override {} |
7c673cae FG |
208 | bool Valid() const override; |
209 | void SeekToFirst() override; | |
210 | void SeekToLast() override; | |
211 | void Seek(const Slice& target) override; | |
212 | void SeekForPrev(const Slice& target) override; | |
213 | void Next() override; | |
214 | void Prev() override; | |
215 | Slice key() const override; | |
216 | Slice value() const override; | |
11fdf7f2 | 217 | Status status() const override { return Status::OK(); } |
7c673cae FG |
218 | void InitIfNeeded(); |
219 | ||
220 | private: | |
221 | struct BucketComparator { | |
222 | BucketComparator(const Slice& file_data, const Comparator* ucomp, | |
223 | uint32_t bucket_len, uint32_t user_key_len, | |
224 | const Slice& target = Slice()) | |
225 | : file_data_(file_data), | |
226 | ucomp_(ucomp), | |
227 | bucket_len_(bucket_len), | |
228 | user_key_len_(user_key_len), | |
229 | target_(target) {} | |
230 | bool operator()(const uint32_t first, const uint32_t second) const { | |
231 | const char* first_bucket = | |
232 | (first == kInvalidIndex) ? target_.data() : | |
233 | &file_data_.data()[first * bucket_len_]; | |
234 | const char* second_bucket = | |
235 | (second == kInvalidIndex) ? target_.data() : | |
236 | &file_data_.data()[second * bucket_len_]; | |
237 | return ucomp_->Compare(Slice(first_bucket, user_key_len_), | |
238 | Slice(second_bucket, user_key_len_)) < 0; | |
239 | } | |
240 | private: | |
241 | const Slice file_data_; | |
242 | const Comparator* ucomp_; | |
243 | const uint32_t bucket_len_; | |
244 | const uint32_t user_key_len_; | |
245 | const Slice target_; | |
246 | }; | |
247 | ||
248 | const BucketComparator bucket_comparator_; | |
249 | void PrepareKVAtCurrIdx(); | |
250 | CuckooTableReader* reader_; | |
251 | bool initialized_; | |
7c673cae FG |
252 | // Contains a map of keys to bucket_id sorted in key order. |
253 | std::vector<uint32_t> sorted_bucket_ids_; | |
254 | // We assume that the number of items can be stored in uint32 (4 Billion). | |
255 | uint32_t curr_key_idx_; | |
256 | Slice curr_value_; | |
257 | IterKey curr_key_; | |
7c673cae FG |
258 | }; |
259 | ||
260 | CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) | |
261 | : bucket_comparator_(reader->file_data_, reader->ucomp_, | |
262 | reader->bucket_length_, reader->user_key_length_), | |
263 | reader_(reader), | |
264 | initialized_(false), | |
265 | curr_key_idx_(kInvalidIndex) { | |
266 | sorted_bucket_ids_.clear(); | |
267 | curr_value_.clear(); | |
268 | curr_key_.Clear(); | |
269 | } | |
270 | ||
271 | void CuckooTableIterator::InitIfNeeded() { | |
272 | if (initialized_) { | |
273 | return; | |
274 | } | |
11fdf7f2 | 275 | sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries)); |
7c673cae FG |
276 | uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; |
277 | assert(num_buckets < kInvalidIndex); | |
278 | const char* bucket = reader_->file_data_.data(); | |
279 | for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { | |
280 | if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { | |
281 | sorted_bucket_ids_.push_back(bucket_id); | |
282 | } | |
283 | bucket += reader_->bucket_length_; | |
284 | } | |
285 | assert(sorted_bucket_ids_.size() == | |
286 | reader_->GetTableProperties()->num_entries); | |
287 | std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), | |
288 | bucket_comparator_); | |
289 | curr_key_idx_ = kInvalidIndex; | |
290 | initialized_ = true; | |
291 | } | |
292 | ||
293 | void CuckooTableIterator::SeekToFirst() { | |
294 | InitIfNeeded(); | |
295 | curr_key_idx_ = 0; | |
296 | PrepareKVAtCurrIdx(); | |
297 | } | |
298 | ||
299 | void CuckooTableIterator::SeekToLast() { | |
300 | InitIfNeeded(); | |
301 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; | |
302 | PrepareKVAtCurrIdx(); | |
303 | } | |
304 | ||
305 | void CuckooTableIterator::Seek(const Slice& target) { | |
306 | InitIfNeeded(); | |
307 | const BucketComparator seek_comparator( | |
308 | reader_->file_data_, reader_->ucomp_, | |
309 | reader_->bucket_length_, reader_->user_key_length_, | |
310 | ExtractUserKey(target)); | |
311 | auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), | |
312 | sorted_bucket_ids_.end(), | |
313 | kInvalidIndex, | |
314 | seek_comparator); | |
315 | curr_key_idx_ = | |
316 | static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); | |
317 | PrepareKVAtCurrIdx(); | |
318 | } | |
319 | ||
11fdf7f2 | 320 | void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { |
7c673cae FG |
321 | // Not supported |
322 | assert(false); | |
323 | } | |
324 | ||
325 | bool CuckooTableIterator::Valid() const { | |
326 | return curr_key_idx_ < sorted_bucket_ids_.size(); | |
327 | } | |
328 | ||
329 | void CuckooTableIterator::PrepareKVAtCurrIdx() { | |
330 | if (!Valid()) { | |
331 | curr_value_.clear(); | |
332 | curr_key_.Clear(); | |
333 | return; | |
334 | } | |
335 | uint32_t id = sorted_bucket_ids_[curr_key_idx_]; | |
336 | const char* offset = reader_->file_data_.data() + | |
337 | id * reader_->bucket_length_; | |
338 | if (reader_->is_last_level_) { | |
339 | // Always return internal key. | |
340 | curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), | |
341 | 0, kTypeValue); | |
342 | } else { | |
343 | curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); | |
344 | } | |
345 | curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); | |
346 | } | |
347 | ||
348 | void CuckooTableIterator::Next() { | |
349 | if (!Valid()) { | |
350 | curr_value_.clear(); | |
351 | curr_key_.Clear(); | |
352 | return; | |
353 | } | |
354 | ++curr_key_idx_; | |
355 | PrepareKVAtCurrIdx(); | |
356 | } | |
357 | ||
358 | void CuckooTableIterator::Prev() { | |
359 | if (curr_key_idx_ == 0) { | |
360 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); | |
361 | } | |
362 | if (!Valid()) { | |
363 | curr_value_.clear(); | |
364 | curr_key_.Clear(); | |
365 | return; | |
366 | } | |
367 | --curr_key_idx_; | |
368 | PrepareKVAtCurrIdx(); | |
369 | } | |
370 | ||
371 | Slice CuckooTableIterator::key() const { | |
372 | assert(Valid()); | |
373 | return curr_key_.GetInternalKey(); | |
374 | } | |
375 | ||
376 | Slice CuckooTableIterator::value() const { | |
377 | assert(Valid()); | |
378 | return curr_value_; | |
379 | } | |
380 | ||
7c673cae | 381 | InternalIterator* CuckooTableReader::NewIterator( |
11fdf7f2 TL |
382 | const ReadOptions& /*read_options*/, |
383 | const SliceTransform* /* prefix_extractor */, Arena* arena, | |
f67539c2 | 384 | bool /*skip_filters*/, TableReaderCaller /*caller*/, |
20effc67 TL |
385 | size_t /*compaction_readahead_size*/, |
386 | bool /* allow_unprepared_value */) { | |
7c673cae | 387 | if (!status().ok()) { |
11fdf7f2 | 388 | return NewErrorInternalIterator<Slice>( |
7c673cae FG |
389 | Status::Corruption("CuckooTableReader status is not okay."), arena); |
390 | } | |
7c673cae FG |
391 | CuckooTableIterator* iter; |
392 | if (arena == nullptr) { | |
393 | iter = new CuckooTableIterator(this); | |
394 | } else { | |
395 | auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); | |
396 | iter = new (iter_mem) CuckooTableIterator(this); | |
397 | } | |
398 | return iter; | |
399 | } | |
400 | ||
401 | size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } | |
402 | ||
f67539c2 | 403 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 404 | #endif |