]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #ifndef ROCKSDB_LITE | |
f67539c2 | 11 | #include "table/cuckoo/cuckoo_table_reader.h" |
7c673cae FG |
12 | |
13 | #include <algorithm> | |
14 | #include <limits> | |
15 | #include <string> | |
16 | #include <utility> | |
17 | #include <vector> | |
1e59de90 | 18 | |
f67539c2 | 19 | #include "memory/arena.h" |
1e59de90 | 20 | #include "options/cf_options.h" |
7c673cae FG |
21 | #include "rocksdb/iterator.h" |
22 | #include "rocksdb/table.h" | |
f67539c2 TL |
23 | #include "table/cuckoo/cuckoo_table_factory.h" |
24 | #include "table/get_context.h" | |
7c673cae FG |
25 | #include "table/internal_iterator.h" |
26 | #include "table/meta_blocks.h" | |
7c673cae FG |
27 | #include "util/coding.h" |
28 | ||
f67539c2 | 29 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
30 | namespace { |
31 | const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); | |
32 | const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); | |
1e59de90 | 33 | } // namespace |
7c673cae FG |
34 | |
35 | extern const uint64_t kCuckooTableMagicNumber; | |
36 | ||
37 | CuckooTableReader::CuckooTableReader( | |
1e59de90 | 38 | const ImmutableOptions& ioptions, |
7c673cae FG |
39 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, |
40 | const Comparator* comparator, | |
41 | uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) | |
42 | : file_(std::move(file)), | |
11fdf7f2 TL |
43 | is_last_level_(false), |
44 | identity_as_first_hash_(false), | |
45 | use_module_hash_(false), | |
46 | num_hash_func_(0), | |
47 | unused_key_(""), | |
48 | key_length_(0), | |
49 | user_key_length_(0), | |
50 | value_length_(0), | |
51 | bucket_length_(0), | |
52 | cuckoo_block_size_(0), | |
53 | cuckoo_block_bytes_minus_one_(0), | |
54 | table_size_(0), | |
7c673cae FG |
55 | ucomp_(comparator), |
56 | get_slice_hash_(get_slice_hash) { | |
57 | if (!ioptions.allow_mmap_reads) { | |
58 | status_ = Status::InvalidArgument("File is not mmaped"); | |
20effc67 | 59 | return; |
7c673cae | 60 | } |
1e59de90 TL |
61 | { |
62 | std::unique_ptr<TableProperties> props; | |
63 | status_ = ReadTableProperties(file_.get(), file_size, | |
64 | kCuckooTableMagicNumber, ioptions, &props); | |
65 | if (!status_.ok()) { | |
66 | return; | |
67 | } | |
68 | table_props_ = std::move(props); | |
7c673cae | 69 | } |
1e59de90 | 70 | auto& user_props = table_props_->user_collected_properties; |
7c673cae FG |
71 | auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); |
72 | if (hash_funs == user_props.end()) { | |
73 | status_ = Status::Corruption("Number of hash functions not found"); | |
74 | return; | |
75 | } | |
76 | num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); | |
77 | auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); | |
78 | if (unused_key == user_props.end()) { | |
79 | status_ = Status::Corruption("Empty bucket value not found"); | |
80 | return; | |
81 | } | |
82 | unused_key_ = unused_key->second; | |
83 | ||
1e59de90 | 84 | key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len); |
7c673cae FG |
85 | auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); |
86 | if (user_key_len == user_props.end()) { | |
87 | status_ = Status::Corruption("User key length not found"); | |
88 | return; | |
89 | } | |
1e59de90 TL |
90 | user_key_length_ = |
91 | *reinterpret_cast<const uint32_t*>(user_key_len->second.data()); | |
7c673cae FG |
92 | |
93 | auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); | |
94 | if (value_length == user_props.end()) { | |
95 | status_ = Status::Corruption("Value length not found"); | |
96 | return; | |
97 | } | |
1e59de90 TL |
98 | value_length_ = |
99 | *reinterpret_cast<const uint32_t*>(value_length->second.data()); | |
7c673cae FG |
100 | bucket_length_ = key_length_ + value_length_; |
101 | ||
1e59de90 TL |
102 | auto hash_table_size = |
103 | user_props.find(CuckooTablePropertyNames::kHashTableSize); | |
7c673cae FG |
104 | if (hash_table_size == user_props.end()) { |
105 | status_ = Status::Corruption("Hash table size not found"); | |
106 | return; | |
107 | } | |
1e59de90 TL |
108 | table_size_ = |
109 | *reinterpret_cast<const uint64_t*>(hash_table_size->second.data()); | |
7c673cae FG |
110 | |
111 | auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); | |
112 | if (is_last_level == user_props.end()) { | |
113 | status_ = Status::Corruption("Is last level not found"); | |
114 | return; | |
115 | } | |
116 | is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); | |
117 | ||
1e59de90 TL |
118 | auto identity_as_first_hash = |
119 | user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash); | |
7c673cae FG |
120 | if (identity_as_first_hash == user_props.end()) { |
121 | status_ = Status::Corruption("identity as first hash not found"); | |
122 | return; | |
123 | } | |
1e59de90 TL |
124 | identity_as_first_hash_ = |
125 | *reinterpret_cast<const bool*>(identity_as_first_hash->second.data()); | |
7c673cae | 126 | |
1e59de90 TL |
127 | auto use_module_hash = |
128 | user_props.find(CuckooTablePropertyNames::kUseModuleHash); | |
7c673cae FG |
129 | if (use_module_hash == user_props.end()) { |
130 | status_ = Status::Corruption("hash type is not found"); | |
131 | return; | |
132 | } | |
1e59de90 TL |
133 | use_module_hash_ = |
134 | *reinterpret_cast<const bool*>(use_module_hash->second.data()); | |
135 | auto cuckoo_block_size = | |
136 | user_props.find(CuckooTablePropertyNames::kCuckooBlockSize); | |
7c673cae FG |
137 | if (cuckoo_block_size == user_props.end()) { |
138 | status_ = Status::Corruption("Cuckoo block size not found"); | |
139 | return; | |
140 | } | |
1e59de90 TL |
141 | cuckoo_block_size_ = |
142 | *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data()); | |
7c673cae | 143 | cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; |
1e59de90 TL |
144 | // TODO: rate limit reads of whole cuckoo tables. |
145 | status_ = | |
146 | file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_, | |
147 | nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); | |
7c673cae FG |
148 | } |
149 | ||
11fdf7f2 TL |
150 | Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, |
151 | const Slice& key, GetContext* get_context, | |
152 | const SliceTransform* /* prefix_extractor */, | |
153 | bool /*skip_filters*/) { | |
7c673cae FG |
154 | assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); |
155 | Slice user_key = ExtractUserKey(key); | |
156 | for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { | |
1e59de90 TL |
157 | uint64_t offset = |
158 | bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_, | |
159 | table_size_, identity_as_first_hash_, | |
160 | get_slice_hash_); | |
7c673cae FG |
161 | const char* bucket = &file_data_.data()[offset]; |
162 | for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; | |
163 | ++block_idx, bucket += bucket_length_) { | |
164 | if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), | |
165 | Slice(bucket, user_key.size()))) { | |
166 | return Status::OK(); | |
167 | } | |
168 | // Here, we compare only the user key part as we support only one entry | |
169 | // per user key and we don't support snapshot. | |
170 | if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { | |
171 | Slice value(bucket + key_length_, value_length_); | |
172 | if (is_last_level_) { | |
173 | // Sequence number is not stored at the last level, so we will use | |
174 | // kMaxSequenceNumber since it is unknown. This could cause some | |
175 | // transactions to fail to lock a key due to known sequence number. | |
176 | // However, it is expected for anyone to use a CuckooTable in a | |
177 | // TransactionDB. | |
178 | get_context->SaveValue(value, kMaxSequenceNumber); | |
179 | } else { | |
180 | Slice full_key(bucket, key_length_); | |
181 | ParsedInternalKey found_ikey; | |
20effc67 TL |
182 | Status s = ParseInternalKey(full_key, &found_ikey, |
183 | false /* log_err_key */); // TODO | |
184 | if (!s.ok()) return s; | |
11fdf7f2 TL |
185 | bool dont_care __attribute__((__unused__)); |
186 | get_context->SaveValue(found_ikey, value, &dont_care); | |
7c673cae FG |
187 | } |
188 | // We don't support merge operations. So, we return here. | |
189 | return Status::OK(); | |
190 | } | |
191 | } | |
192 | } | |
193 | return Status::OK(); | |
194 | } | |
195 | ||
196 | void CuckooTableReader::Prepare(const Slice& key) { | |
197 | // Prefetch the first Cuckoo Block. | |
198 | Slice user_key = ExtractUserKey(key); | |
1e59de90 TL |
199 | uint64_t addr = |
200 | reinterpret_cast<uint64_t>(file_data_.data()) + | |
201 | bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, | |
202 | identity_as_first_hash_, nullptr); | |
7c673cae FG |
203 | uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; |
204 | for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { | |
205 | PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); | |
206 | } | |
207 | } | |
208 | ||
209 | class CuckooTableIterator : public InternalIterator { | |
210 | public: | |
211 | explicit CuckooTableIterator(CuckooTableReader* reader); | |
f67539c2 TL |
212 | // No copying allowed |
213 | CuckooTableIterator(const CuckooTableIterator&) = delete; | |
214 | void operator=(const Iterator&) = delete; | |
494da23a | 215 | ~CuckooTableIterator() override {} |
7c673cae FG |
216 | bool Valid() const override; |
217 | void SeekToFirst() override; | |
218 | void SeekToLast() override; | |
219 | void Seek(const Slice& target) override; | |
220 | void SeekForPrev(const Slice& target) override; | |
221 | void Next() override; | |
222 | void Prev() override; | |
223 | Slice key() const override; | |
224 | Slice value() const override; | |
11fdf7f2 | 225 | Status status() const override { return Status::OK(); } |
7c673cae FG |
226 | void InitIfNeeded(); |
227 | ||
228 | private: | |
229 | struct BucketComparator { | |
230 | BucketComparator(const Slice& file_data, const Comparator* ucomp, | |
231 | uint32_t bucket_len, uint32_t user_key_len, | |
232 | const Slice& target = Slice()) | |
1e59de90 TL |
233 | : file_data_(file_data), |
234 | ucomp_(ucomp), | |
235 | bucket_len_(bucket_len), | |
236 | user_key_len_(user_key_len), | |
237 | target_(target) {} | |
7c673cae | 238 | bool operator()(const uint32_t first, const uint32_t second) const { |
1e59de90 TL |
239 | const char* first_bucket = (first == kInvalidIndex) |
240 | ? target_.data() | |
241 | : &file_data_.data()[first * bucket_len_]; | |
7c673cae | 242 | const char* second_bucket = |
1e59de90 TL |
243 | (second == kInvalidIndex) ? target_.data() |
244 | : &file_data_.data()[second * bucket_len_]; | |
7c673cae FG |
245 | return ucomp_->Compare(Slice(first_bucket, user_key_len_), |
246 | Slice(second_bucket, user_key_len_)) < 0; | |
247 | } | |
1e59de90 | 248 | |
7c673cae FG |
249 | private: |
250 | const Slice file_data_; | |
251 | const Comparator* ucomp_; | |
252 | const uint32_t bucket_len_; | |
253 | const uint32_t user_key_len_; | |
254 | const Slice target_; | |
255 | }; | |
256 | ||
257 | const BucketComparator bucket_comparator_; | |
258 | void PrepareKVAtCurrIdx(); | |
259 | CuckooTableReader* reader_; | |
260 | bool initialized_; | |
7c673cae FG |
261 | // Contains a map of keys to bucket_id sorted in key order. |
262 | std::vector<uint32_t> sorted_bucket_ids_; | |
263 | // We assume that the number of items can be stored in uint32 (4 Billion). | |
264 | uint32_t curr_key_idx_; | |
265 | Slice curr_value_; | |
266 | IterKey curr_key_; | |
7c673cae FG |
267 | }; |
268 | ||
269 | CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) | |
1e59de90 TL |
270 | : bucket_comparator_(reader->file_data_, reader->ucomp_, |
271 | reader->bucket_length_, reader->user_key_length_), | |
272 | reader_(reader), | |
273 | initialized_(false), | |
274 | curr_key_idx_(kInvalidIndex) { | |
7c673cae FG |
275 | sorted_bucket_ids_.clear(); |
276 | curr_value_.clear(); | |
277 | curr_key_.Clear(); | |
278 | } | |
279 | ||
280 | void CuckooTableIterator::InitIfNeeded() { | |
281 | if (initialized_) { | |
282 | return; | |
283 | } | |
1e59de90 TL |
284 | sorted_bucket_ids_.reserve( |
285 | static_cast<size_t>(reader_->GetTableProperties()->num_entries)); | |
7c673cae FG |
286 | uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; |
287 | assert(num_buckets < kInvalidIndex); | |
288 | const char* bucket = reader_->file_data_.data(); | |
289 | for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { | |
290 | if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { | |
291 | sorted_bucket_ids_.push_back(bucket_id); | |
292 | } | |
293 | bucket += reader_->bucket_length_; | |
294 | } | |
295 | assert(sorted_bucket_ids_.size() == | |
1e59de90 | 296 | reader_->GetTableProperties()->num_entries); |
7c673cae FG |
297 | std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), |
298 | bucket_comparator_); | |
299 | curr_key_idx_ = kInvalidIndex; | |
300 | initialized_ = true; | |
301 | } | |
302 | ||
303 | void CuckooTableIterator::SeekToFirst() { | |
304 | InitIfNeeded(); | |
305 | curr_key_idx_ = 0; | |
306 | PrepareKVAtCurrIdx(); | |
307 | } | |
308 | ||
309 | void CuckooTableIterator::SeekToLast() { | |
310 | InitIfNeeded(); | |
311 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; | |
312 | PrepareKVAtCurrIdx(); | |
313 | } | |
314 | ||
315 | void CuckooTableIterator::Seek(const Slice& target) { | |
316 | InitIfNeeded(); | |
317 | const BucketComparator seek_comparator( | |
1e59de90 TL |
318 | reader_->file_data_, reader_->ucomp_, reader_->bucket_length_, |
319 | reader_->user_key_length_, ExtractUserKey(target)); | |
320 | auto seek_it = | |
321 | std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), | |
322 | kInvalidIndex, seek_comparator); | |
7c673cae FG |
323 | curr_key_idx_ = |
324 | static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); | |
325 | PrepareKVAtCurrIdx(); | |
326 | } | |
327 | ||
11fdf7f2 | 328 | void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { |
7c673cae FG |
329 | // Not supported |
330 | assert(false); | |
331 | } | |
332 | ||
333 | bool CuckooTableIterator::Valid() const { | |
334 | return curr_key_idx_ < sorted_bucket_ids_.size(); | |
335 | } | |
336 | ||
337 | void CuckooTableIterator::PrepareKVAtCurrIdx() { | |
338 | if (!Valid()) { | |
339 | curr_value_.clear(); | |
340 | curr_key_.Clear(); | |
341 | return; | |
342 | } | |
343 | uint32_t id = sorted_bucket_ids_[curr_key_idx_]; | |
1e59de90 TL |
344 | const char* offset = |
345 | reader_->file_data_.data() + id * reader_->bucket_length_; | |
7c673cae FG |
346 | if (reader_->is_last_level_) { |
347 | // Always return internal key. | |
1e59de90 TL |
348 | curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0, |
349 | kTypeValue); | |
7c673cae FG |
350 | } else { |
351 | curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); | |
352 | } | |
353 | curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); | |
354 | } | |
355 | ||
356 | void CuckooTableIterator::Next() { | |
357 | if (!Valid()) { | |
358 | curr_value_.clear(); | |
359 | curr_key_.Clear(); | |
360 | return; | |
361 | } | |
362 | ++curr_key_idx_; | |
363 | PrepareKVAtCurrIdx(); | |
364 | } | |
365 | ||
366 | void CuckooTableIterator::Prev() { | |
367 | if (curr_key_idx_ == 0) { | |
368 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); | |
369 | } | |
370 | if (!Valid()) { | |
371 | curr_value_.clear(); | |
372 | curr_key_.Clear(); | |
373 | return; | |
374 | } | |
375 | --curr_key_idx_; | |
376 | PrepareKVAtCurrIdx(); | |
377 | } | |
378 | ||
379 | Slice CuckooTableIterator::key() const { | |
380 | assert(Valid()); | |
381 | return curr_key_.GetInternalKey(); | |
382 | } | |
383 | ||
384 | Slice CuckooTableIterator::value() const { | |
385 | assert(Valid()); | |
386 | return curr_value_; | |
387 | } | |
388 | ||
7c673cae | 389 | InternalIterator* CuckooTableReader::NewIterator( |
11fdf7f2 TL |
390 | const ReadOptions& /*read_options*/, |
391 | const SliceTransform* /* prefix_extractor */, Arena* arena, | |
f67539c2 | 392 | bool /*skip_filters*/, TableReaderCaller /*caller*/, |
1e59de90 | 393 | size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { |
7c673cae | 394 | if (!status().ok()) { |
11fdf7f2 | 395 | return NewErrorInternalIterator<Slice>( |
7c673cae FG |
396 | Status::Corruption("CuckooTableReader status is not okay."), arena); |
397 | } | |
7c673cae FG |
398 | CuckooTableIterator* iter; |
399 | if (arena == nullptr) { | |
400 | iter = new CuckooTableIterator(this); | |
401 | } else { | |
402 | auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); | |
403 | iter = new (iter_mem) CuckooTableIterator(this); | |
404 | } | |
405 | return iter; | |
406 | } | |
407 | ||
408 | size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } | |
409 | ||
f67539c2 | 410 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 411 | #endif |