]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/cuckoo_table_reader.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / table / cuckoo_table_reader.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #ifndef ROCKSDB_LITE
11 #include "table/cuckoo_table_reader.h"
12
13 #include <algorithm>
14 #include <limits>
15 #include <string>
16 #include <utility>
17 #include <vector>
18 #include "rocksdb/iterator.h"
19 #include "rocksdb/table.h"
20 #include "table/internal_iterator.h"
21 #include "table/meta_blocks.h"
22 #include "table/cuckoo_table_factory.h"
23 #include "table/get_context.h"
24 #include "util/arena.h"
25 #include "util/coding.h"
26
27 namespace rocksdb {
28 namespace {
29 const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
30 const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
31 }
32
33 extern const uint64_t kCuckooTableMagicNumber;
34
35 CuckooTableReader::CuckooTableReader(
36 const ImmutableCFOptions& ioptions,
37 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
38 const Comparator* comparator,
39 uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
40 : file_(std::move(file)),
41 is_last_level_(false),
42 identity_as_first_hash_(false),
43 use_module_hash_(false),
44 num_hash_func_(0),
45 unused_key_(""),
46 key_length_(0),
47 user_key_length_(0),
48 value_length_(0),
49 bucket_length_(0),
50 cuckoo_block_size_(0),
51 cuckoo_block_bytes_minus_one_(0),
52 table_size_(0),
53 ucomp_(comparator),
54 get_slice_hash_(get_slice_hash) {
55 if (!ioptions.allow_mmap_reads) {
56 status_ = Status::InvalidArgument("File is not mmaped");
57 }
58 TableProperties* props = nullptr;
59 status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
60 ioptions, &props, true /* compression_type_missing */);
61 if (!status_.ok()) {
62 return;
63 }
64 table_props_.reset(props);
65 auto& user_props = props->user_collected_properties;
66 auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
67 if (hash_funs == user_props.end()) {
68 status_ = Status::Corruption("Number of hash functions not found");
69 return;
70 }
71 num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
72 auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
73 if (unused_key == user_props.end()) {
74 status_ = Status::Corruption("Empty bucket value not found");
75 return;
76 }
77 unused_key_ = unused_key->second;
78
79 key_length_ = static_cast<uint32_t>(props->fixed_key_len);
80 auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
81 if (user_key_len == user_props.end()) {
82 status_ = Status::Corruption("User key length not found");
83 return;
84 }
85 user_key_length_ = *reinterpret_cast<const uint32_t*>(
86 user_key_len->second.data());
87
88 auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
89 if (value_length == user_props.end()) {
90 status_ = Status::Corruption("Value length not found");
91 return;
92 }
93 value_length_ = *reinterpret_cast<const uint32_t*>(
94 value_length->second.data());
95 bucket_length_ = key_length_ + value_length_;
96
97 auto hash_table_size = user_props.find(
98 CuckooTablePropertyNames::kHashTableSize);
99 if (hash_table_size == user_props.end()) {
100 status_ = Status::Corruption("Hash table size not found");
101 return;
102 }
103 table_size_ = *reinterpret_cast<const uint64_t*>(
104 hash_table_size->second.data());
105
106 auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
107 if (is_last_level == user_props.end()) {
108 status_ = Status::Corruption("Is last level not found");
109 return;
110 }
111 is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
112
113 auto identity_as_first_hash = user_props.find(
114 CuckooTablePropertyNames::kIdentityAsFirstHash);
115 if (identity_as_first_hash == user_props.end()) {
116 status_ = Status::Corruption("identity as first hash not found");
117 return;
118 }
119 identity_as_first_hash_ = *reinterpret_cast<const bool*>(
120 identity_as_first_hash->second.data());
121
122 auto use_module_hash = user_props.find(
123 CuckooTablePropertyNames::kUseModuleHash);
124 if (use_module_hash == user_props.end()) {
125 status_ = Status::Corruption("hash type is not found");
126 return;
127 }
128 use_module_hash_ = *reinterpret_cast<const bool*>(
129 use_module_hash->second.data());
130 auto cuckoo_block_size = user_props.find(
131 CuckooTablePropertyNames::kCuckooBlockSize);
132 if (cuckoo_block_size == user_props.end()) {
133 status_ = Status::Corruption("Cuckoo block size not found");
134 return;
135 }
136 cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
137 cuckoo_block_size->second.data());
138 cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
139 status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr);
140 }
141
142 Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
143 const Slice& key, GetContext* get_context,
144 const SliceTransform* /* prefix_extractor */,
145 bool /*skip_filters*/) {
146 assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
147 Slice user_key = ExtractUserKey(key);
148 for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
149 uint64_t offset = bucket_length_ * CuckooHash(
150 user_key, hash_cnt, use_module_hash_, table_size_,
151 identity_as_first_hash_, get_slice_hash_);
152 const char* bucket = &file_data_.data()[offset];
153 for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
154 ++block_idx, bucket += bucket_length_) {
155 if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
156 Slice(bucket, user_key.size()))) {
157 return Status::OK();
158 }
159 // Here, we compare only the user key part as we support only one entry
160 // per user key and we don't support snapshot.
161 if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
162 Slice value(bucket + key_length_, value_length_);
163 if (is_last_level_) {
164 // Sequence number is not stored at the last level, so we will use
165 // kMaxSequenceNumber since it is unknown. This could cause some
166 // transactions to fail to lock a key due to known sequence number.
167 // However, it is expected for anyone to use a CuckooTable in a
168 // TransactionDB.
169 get_context->SaveValue(value, kMaxSequenceNumber);
170 } else {
171 Slice full_key(bucket, key_length_);
172 ParsedInternalKey found_ikey;
173 ParseInternalKey(full_key, &found_ikey);
174 bool dont_care __attribute__((__unused__));
175 get_context->SaveValue(found_ikey, value, &dont_care);
176 }
177 // We don't support merge operations. So, we return here.
178 return Status::OK();
179 }
180 }
181 }
182 return Status::OK();
183 }
184
185 void CuckooTableReader::Prepare(const Slice& key) {
186 // Prefetch the first Cuckoo Block.
187 Slice user_key = ExtractUserKey(key);
188 uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
189 bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
190 identity_as_first_hash_, nullptr);
191 uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
192 for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
193 PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
194 }
195 }
196
197 class CuckooTableIterator : public InternalIterator {
198 public:
199 explicit CuckooTableIterator(CuckooTableReader* reader);
200 ~CuckooTableIterator() override {}
201 bool Valid() const override;
202 void SeekToFirst() override;
203 void SeekToLast() override;
204 void Seek(const Slice& target) override;
205 void SeekForPrev(const Slice& target) override;
206 void Next() override;
207 void Prev() override;
208 Slice key() const override;
209 Slice value() const override;
210 Status status() const override { return Status::OK(); }
211 void InitIfNeeded();
212
213 private:
214 struct BucketComparator {
215 BucketComparator(const Slice& file_data, const Comparator* ucomp,
216 uint32_t bucket_len, uint32_t user_key_len,
217 const Slice& target = Slice())
218 : file_data_(file_data),
219 ucomp_(ucomp),
220 bucket_len_(bucket_len),
221 user_key_len_(user_key_len),
222 target_(target) {}
223 bool operator()(const uint32_t first, const uint32_t second) const {
224 const char* first_bucket =
225 (first == kInvalidIndex) ? target_.data() :
226 &file_data_.data()[first * bucket_len_];
227 const char* second_bucket =
228 (second == kInvalidIndex) ? target_.data() :
229 &file_data_.data()[second * bucket_len_];
230 return ucomp_->Compare(Slice(first_bucket, user_key_len_),
231 Slice(second_bucket, user_key_len_)) < 0;
232 }
233 private:
234 const Slice file_data_;
235 const Comparator* ucomp_;
236 const uint32_t bucket_len_;
237 const uint32_t user_key_len_;
238 const Slice target_;
239 };
240
241 const BucketComparator bucket_comparator_;
242 void PrepareKVAtCurrIdx();
243 CuckooTableReader* reader_;
244 bool initialized_;
245 // Contains a map of keys to bucket_id sorted in key order.
246 std::vector<uint32_t> sorted_bucket_ids_;
247 // We assume that the number of items can be stored in uint32 (4 Billion).
248 uint32_t curr_key_idx_;
249 Slice curr_value_;
250 IterKey curr_key_;
251 // No copying allowed
252 CuckooTableIterator(const CuckooTableIterator&) = delete;
253 void operator=(const Iterator&) = delete;
254 };
255
256 CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
257 : bucket_comparator_(reader->file_data_, reader->ucomp_,
258 reader->bucket_length_, reader->user_key_length_),
259 reader_(reader),
260 initialized_(false),
261 curr_key_idx_(kInvalidIndex) {
262 sorted_bucket_ids_.clear();
263 curr_value_.clear();
264 curr_key_.Clear();
265 }
266
267 void CuckooTableIterator::InitIfNeeded() {
268 if (initialized_) {
269 return;
270 }
271 sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
272 uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
273 assert(num_buckets < kInvalidIndex);
274 const char* bucket = reader_->file_data_.data();
275 for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
276 if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
277 sorted_bucket_ids_.push_back(bucket_id);
278 }
279 bucket += reader_->bucket_length_;
280 }
281 assert(sorted_bucket_ids_.size() ==
282 reader_->GetTableProperties()->num_entries);
283 std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
284 bucket_comparator_);
285 curr_key_idx_ = kInvalidIndex;
286 initialized_ = true;
287 }
288
289 void CuckooTableIterator::SeekToFirst() {
290 InitIfNeeded();
291 curr_key_idx_ = 0;
292 PrepareKVAtCurrIdx();
293 }
294
295 void CuckooTableIterator::SeekToLast() {
296 InitIfNeeded();
297 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
298 PrepareKVAtCurrIdx();
299 }
300
301 void CuckooTableIterator::Seek(const Slice& target) {
302 InitIfNeeded();
303 const BucketComparator seek_comparator(
304 reader_->file_data_, reader_->ucomp_,
305 reader_->bucket_length_, reader_->user_key_length_,
306 ExtractUserKey(target));
307 auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
308 sorted_bucket_ids_.end(),
309 kInvalidIndex,
310 seek_comparator);
311 curr_key_idx_ =
312 static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
313 PrepareKVAtCurrIdx();
314 }
315
316 void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
317 // Not supported
318 assert(false);
319 }
320
321 bool CuckooTableIterator::Valid() const {
322 return curr_key_idx_ < sorted_bucket_ids_.size();
323 }
324
325 void CuckooTableIterator::PrepareKVAtCurrIdx() {
326 if (!Valid()) {
327 curr_value_.clear();
328 curr_key_.Clear();
329 return;
330 }
331 uint32_t id = sorted_bucket_ids_[curr_key_idx_];
332 const char* offset = reader_->file_data_.data() +
333 id * reader_->bucket_length_;
334 if (reader_->is_last_level_) {
335 // Always return internal key.
336 curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
337 0, kTypeValue);
338 } else {
339 curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
340 }
341 curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
342 }
343
344 void CuckooTableIterator::Next() {
345 if (!Valid()) {
346 curr_value_.clear();
347 curr_key_.Clear();
348 return;
349 }
350 ++curr_key_idx_;
351 PrepareKVAtCurrIdx();
352 }
353
354 void CuckooTableIterator::Prev() {
355 if (curr_key_idx_ == 0) {
356 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
357 }
358 if (!Valid()) {
359 curr_value_.clear();
360 curr_key_.Clear();
361 return;
362 }
363 --curr_key_idx_;
364 PrepareKVAtCurrIdx();
365 }
366
367 Slice CuckooTableIterator::key() const {
368 assert(Valid());
369 return curr_key_.GetInternalKey();
370 }
371
372 Slice CuckooTableIterator::value() const {
373 assert(Valid());
374 return curr_value_;
375 }
376
377 InternalIterator* CuckooTableReader::NewIterator(
378 const ReadOptions& /*read_options*/,
379 const SliceTransform* /* prefix_extractor */, Arena* arena,
380 bool /*skip_filters*/, bool /*for_compaction*/) {
381 if (!status().ok()) {
382 return NewErrorInternalIterator<Slice>(
383 Status::Corruption("CuckooTableReader status is not okay."), arena);
384 }
385 CuckooTableIterator* iter;
386 if (arena == nullptr) {
387 iter = new CuckooTableIterator(this);
388 } else {
389 auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
390 iter = new (iter_mem) CuckooTableIterator(this);
391 }
392 return iter;
393 }
394
395 size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
396
397 } // namespace rocksdb
398 #endif