]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / cuckoo / cuckoo_table_reader.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifndef ROCKSDB_LITE
f67539c2 11#include "table/cuckoo/cuckoo_table_reader.h"
7c673cae
FG
12
13#include <algorithm>
14#include <limits>
15#include <string>
16#include <utility>
17#include <vector>
f67539c2 18#include "memory/arena.h"
7c673cae
FG
19#include "rocksdb/iterator.h"
20#include "rocksdb/table.h"
f67539c2
TL
21#include "table/cuckoo/cuckoo_table_factory.h"
22#include "table/get_context.h"
7c673cae
FG
23#include "table/internal_iterator.h"
24#include "table/meta_blocks.h"
7c673cae
FG
25#include "util/coding.h"
26
f67539c2 27namespace ROCKSDB_NAMESPACE {
7c673cae
FG
28namespace {
29const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
30const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
31}
32
33extern const uint64_t kCuckooTableMagicNumber;
34
35CuckooTableReader::CuckooTableReader(
36 const ImmutableCFOptions& ioptions,
37 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
38 const Comparator* comparator,
39 uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
40 : file_(std::move(file)),
11fdf7f2
TL
41 is_last_level_(false),
42 identity_as_first_hash_(false),
43 use_module_hash_(false),
44 num_hash_func_(0),
45 unused_key_(""),
46 key_length_(0),
47 user_key_length_(0),
48 value_length_(0),
49 bucket_length_(0),
50 cuckoo_block_size_(0),
51 cuckoo_block_bytes_minus_one_(0),
52 table_size_(0),
7c673cae
FG
53 ucomp_(comparator),
54 get_slice_hash_(get_slice_hash) {
55 if (!ioptions.allow_mmap_reads) {
56 status_ = Status::InvalidArgument("File is not mmaped");
20effc67 57 return;
7c673cae
FG
58 }
59 TableProperties* props = nullptr;
60 status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
11fdf7f2 61 ioptions, &props, true /* compression_type_missing */);
7c673cae
FG
62 if (!status_.ok()) {
63 return;
64 }
65 table_props_.reset(props);
66 auto& user_props = props->user_collected_properties;
67 auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
68 if (hash_funs == user_props.end()) {
69 status_ = Status::Corruption("Number of hash functions not found");
70 return;
71 }
72 num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
73 auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
74 if (unused_key == user_props.end()) {
75 status_ = Status::Corruption("Empty bucket value not found");
76 return;
77 }
78 unused_key_ = unused_key->second;
79
80 key_length_ = static_cast<uint32_t>(props->fixed_key_len);
81 auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
82 if (user_key_len == user_props.end()) {
83 status_ = Status::Corruption("User key length not found");
84 return;
85 }
86 user_key_length_ = *reinterpret_cast<const uint32_t*>(
87 user_key_len->second.data());
88
89 auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
90 if (value_length == user_props.end()) {
91 status_ = Status::Corruption("Value length not found");
92 return;
93 }
94 value_length_ = *reinterpret_cast<const uint32_t*>(
95 value_length->second.data());
96 bucket_length_ = key_length_ + value_length_;
97
98 auto hash_table_size = user_props.find(
99 CuckooTablePropertyNames::kHashTableSize);
100 if (hash_table_size == user_props.end()) {
101 status_ = Status::Corruption("Hash table size not found");
102 return;
103 }
104 table_size_ = *reinterpret_cast<const uint64_t*>(
105 hash_table_size->second.data());
106
107 auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
108 if (is_last_level == user_props.end()) {
109 status_ = Status::Corruption("Is last level not found");
110 return;
111 }
112 is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
113
114 auto identity_as_first_hash = user_props.find(
115 CuckooTablePropertyNames::kIdentityAsFirstHash);
116 if (identity_as_first_hash == user_props.end()) {
117 status_ = Status::Corruption("identity as first hash not found");
118 return;
119 }
120 identity_as_first_hash_ = *reinterpret_cast<const bool*>(
121 identity_as_first_hash->second.data());
122
123 auto use_module_hash = user_props.find(
124 CuckooTablePropertyNames::kUseModuleHash);
125 if (use_module_hash == user_props.end()) {
126 status_ = Status::Corruption("hash type is not found");
127 return;
128 }
129 use_module_hash_ = *reinterpret_cast<const bool*>(
130 use_module_hash->second.data());
131 auto cuckoo_block_size = user_props.find(
132 CuckooTablePropertyNames::kCuckooBlockSize);
133 if (cuckoo_block_size == user_props.end()) {
134 status_ = Status::Corruption("Cuckoo block size not found");
135 return;
136 }
137 cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
138 cuckoo_block_size->second.data());
139 cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
20effc67
TL
140 status_ = file_->Read(IOOptions(), 0, static_cast<size_t>(file_size),
141 &file_data_, nullptr, nullptr);
7c673cae
FG
142}
143
11fdf7f2
TL
144Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
145 const Slice& key, GetContext* get_context,
146 const SliceTransform* /* prefix_extractor */,
147 bool /*skip_filters*/) {
7c673cae
FG
148 assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
149 Slice user_key = ExtractUserKey(key);
150 for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
151 uint64_t offset = bucket_length_ * CuckooHash(
152 user_key, hash_cnt, use_module_hash_, table_size_,
153 identity_as_first_hash_, get_slice_hash_);
154 const char* bucket = &file_data_.data()[offset];
155 for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
156 ++block_idx, bucket += bucket_length_) {
157 if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
158 Slice(bucket, user_key.size()))) {
159 return Status::OK();
160 }
161 // Here, we compare only the user key part as we support only one entry
162 // per user key and we don't support snapshot.
163 if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
164 Slice value(bucket + key_length_, value_length_);
165 if (is_last_level_) {
166 // Sequence number is not stored at the last level, so we will use
167 // kMaxSequenceNumber since it is unknown. This could cause some
168 // transactions to fail to lock a key due to known sequence number.
169 // However, it is expected for anyone to use a CuckooTable in a
170 // TransactionDB.
171 get_context->SaveValue(value, kMaxSequenceNumber);
172 } else {
173 Slice full_key(bucket, key_length_);
174 ParsedInternalKey found_ikey;
20effc67
TL
175 Status s = ParseInternalKey(full_key, &found_ikey,
176 false /* log_err_key */); // TODO
177 if (!s.ok()) return s;
11fdf7f2
TL
178 bool dont_care __attribute__((__unused__));
179 get_context->SaveValue(found_ikey, value, &dont_care);
7c673cae
FG
180 }
181 // We don't support merge operations. So, we return here.
182 return Status::OK();
183 }
184 }
185 }
186 return Status::OK();
187}
188
189void CuckooTableReader::Prepare(const Slice& key) {
190 // Prefetch the first Cuckoo Block.
191 Slice user_key = ExtractUserKey(key);
192 uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
193 bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
194 identity_as_first_hash_, nullptr);
195 uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
196 for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
197 PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
198 }
199}
200
201class CuckooTableIterator : public InternalIterator {
202 public:
203 explicit CuckooTableIterator(CuckooTableReader* reader);
f67539c2
TL
204 // No copying allowed
205 CuckooTableIterator(const CuckooTableIterator&) = delete;
206 void operator=(const Iterator&) = delete;
494da23a 207 ~CuckooTableIterator() override {}
7c673cae
FG
208 bool Valid() const override;
209 void SeekToFirst() override;
210 void SeekToLast() override;
211 void Seek(const Slice& target) override;
212 void SeekForPrev(const Slice& target) override;
213 void Next() override;
214 void Prev() override;
215 Slice key() const override;
216 Slice value() const override;
11fdf7f2 217 Status status() const override { return Status::OK(); }
7c673cae
FG
218 void InitIfNeeded();
219
220 private:
221 struct BucketComparator {
222 BucketComparator(const Slice& file_data, const Comparator* ucomp,
223 uint32_t bucket_len, uint32_t user_key_len,
224 const Slice& target = Slice())
225 : file_data_(file_data),
226 ucomp_(ucomp),
227 bucket_len_(bucket_len),
228 user_key_len_(user_key_len),
229 target_(target) {}
230 bool operator()(const uint32_t first, const uint32_t second) const {
231 const char* first_bucket =
232 (first == kInvalidIndex) ? target_.data() :
233 &file_data_.data()[first * bucket_len_];
234 const char* second_bucket =
235 (second == kInvalidIndex) ? target_.data() :
236 &file_data_.data()[second * bucket_len_];
237 return ucomp_->Compare(Slice(first_bucket, user_key_len_),
238 Slice(second_bucket, user_key_len_)) < 0;
239 }
240 private:
241 const Slice file_data_;
242 const Comparator* ucomp_;
243 const uint32_t bucket_len_;
244 const uint32_t user_key_len_;
245 const Slice target_;
246 };
247
248 const BucketComparator bucket_comparator_;
249 void PrepareKVAtCurrIdx();
250 CuckooTableReader* reader_;
251 bool initialized_;
7c673cae
FG
252 // Contains a map of keys to bucket_id sorted in key order.
253 std::vector<uint32_t> sorted_bucket_ids_;
254 // We assume that the number of items can be stored in uint32 (4 Billion).
255 uint32_t curr_key_idx_;
256 Slice curr_value_;
257 IterKey curr_key_;
7c673cae
FG
258};
259
260CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
261 : bucket_comparator_(reader->file_data_, reader->ucomp_,
262 reader->bucket_length_, reader->user_key_length_),
263 reader_(reader),
264 initialized_(false),
265 curr_key_idx_(kInvalidIndex) {
266 sorted_bucket_ids_.clear();
267 curr_value_.clear();
268 curr_key_.Clear();
269}
270
271void CuckooTableIterator::InitIfNeeded() {
272 if (initialized_) {
273 return;
274 }
11fdf7f2 275 sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
7c673cae
FG
276 uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
277 assert(num_buckets < kInvalidIndex);
278 const char* bucket = reader_->file_data_.data();
279 for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
280 if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
281 sorted_bucket_ids_.push_back(bucket_id);
282 }
283 bucket += reader_->bucket_length_;
284 }
285 assert(sorted_bucket_ids_.size() ==
286 reader_->GetTableProperties()->num_entries);
287 std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
288 bucket_comparator_);
289 curr_key_idx_ = kInvalidIndex;
290 initialized_ = true;
291}
292
293void CuckooTableIterator::SeekToFirst() {
294 InitIfNeeded();
295 curr_key_idx_ = 0;
296 PrepareKVAtCurrIdx();
297}
298
299void CuckooTableIterator::SeekToLast() {
300 InitIfNeeded();
301 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
302 PrepareKVAtCurrIdx();
303}
304
305void CuckooTableIterator::Seek(const Slice& target) {
306 InitIfNeeded();
307 const BucketComparator seek_comparator(
308 reader_->file_data_, reader_->ucomp_,
309 reader_->bucket_length_, reader_->user_key_length_,
310 ExtractUserKey(target));
311 auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
312 sorted_bucket_ids_.end(),
313 kInvalidIndex,
314 seek_comparator);
315 curr_key_idx_ =
316 static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
317 PrepareKVAtCurrIdx();
318}
319
11fdf7f2 320void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
7c673cae
FG
321 // Not supported
322 assert(false);
323}
324
325bool CuckooTableIterator::Valid() const {
326 return curr_key_idx_ < sorted_bucket_ids_.size();
327}
328
329void CuckooTableIterator::PrepareKVAtCurrIdx() {
330 if (!Valid()) {
331 curr_value_.clear();
332 curr_key_.Clear();
333 return;
334 }
335 uint32_t id = sorted_bucket_ids_[curr_key_idx_];
336 const char* offset = reader_->file_data_.data() +
337 id * reader_->bucket_length_;
338 if (reader_->is_last_level_) {
339 // Always return internal key.
340 curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
341 0, kTypeValue);
342 } else {
343 curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
344 }
345 curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
346}
347
348void CuckooTableIterator::Next() {
349 if (!Valid()) {
350 curr_value_.clear();
351 curr_key_.Clear();
352 return;
353 }
354 ++curr_key_idx_;
355 PrepareKVAtCurrIdx();
356}
357
358void CuckooTableIterator::Prev() {
359 if (curr_key_idx_ == 0) {
360 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
361 }
362 if (!Valid()) {
363 curr_value_.clear();
364 curr_key_.Clear();
365 return;
366 }
367 --curr_key_idx_;
368 PrepareKVAtCurrIdx();
369}
370
371Slice CuckooTableIterator::key() const {
372 assert(Valid());
373 return curr_key_.GetInternalKey();
374}
375
376Slice CuckooTableIterator::value() const {
377 assert(Valid());
378 return curr_value_;
379}
380
7c673cae 381InternalIterator* CuckooTableReader::NewIterator(
11fdf7f2
TL
382 const ReadOptions& /*read_options*/,
383 const SliceTransform* /* prefix_extractor */, Arena* arena,
f67539c2 384 bool /*skip_filters*/, TableReaderCaller /*caller*/,
20effc67
TL
385 size_t /*compaction_readahead_size*/,
386 bool /* allow_unprepared_value */) {
7c673cae 387 if (!status().ok()) {
11fdf7f2 388 return NewErrorInternalIterator<Slice>(
7c673cae
FG
389 Status::Corruption("CuckooTableReader status is not okay."), arena);
390 }
7c673cae
FG
391 CuckooTableIterator* iter;
392 if (arena == nullptr) {
393 iter = new CuckooTableIterator(this);
394 } else {
395 auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
396 iter = new (iter_mem) CuckooTableIterator(this);
397 }
398 return iter;
399}
400
401size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
402
f67539c2 403} // namespace ROCKSDB_NAMESPACE
7c673cae 404#endif