]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / cuckoo / cuckoo_table_reader.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifndef ROCKSDB_LITE
f67539c2 11#include "table/cuckoo/cuckoo_table_reader.h"
7c673cae
FG
12
13#include <algorithm>
14#include <limits>
15#include <string>
16#include <utility>
17#include <vector>
1e59de90 18
f67539c2 19#include "memory/arena.h"
1e59de90 20#include "options/cf_options.h"
7c673cae
FG
21#include "rocksdb/iterator.h"
22#include "rocksdb/table.h"
f67539c2
TL
23#include "table/cuckoo/cuckoo_table_factory.h"
24#include "table/get_context.h"
7c673cae
FG
25#include "table/internal_iterator.h"
26#include "table/meta_blocks.h"
7c673cae
FG
27#include "util/coding.h"
28
f67539c2 29namespace ROCKSDB_NAMESPACE {
7c673cae
FG
30namespace {
31const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
32const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
1e59de90 33} // namespace
7c673cae
FG
34
35extern const uint64_t kCuckooTableMagicNumber;
36
37CuckooTableReader::CuckooTableReader(
1e59de90 38 const ImmutableOptions& ioptions,
7c673cae
FG
39 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
40 const Comparator* comparator,
41 uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
42 : file_(std::move(file)),
11fdf7f2
TL
43 is_last_level_(false),
44 identity_as_first_hash_(false),
45 use_module_hash_(false),
46 num_hash_func_(0),
47 unused_key_(""),
48 key_length_(0),
49 user_key_length_(0),
50 value_length_(0),
51 bucket_length_(0),
52 cuckoo_block_size_(0),
53 cuckoo_block_bytes_minus_one_(0),
54 table_size_(0),
7c673cae
FG
55 ucomp_(comparator),
56 get_slice_hash_(get_slice_hash) {
57 if (!ioptions.allow_mmap_reads) {
58 status_ = Status::InvalidArgument("File is not mmaped");
20effc67 59 return;
7c673cae 60 }
1e59de90
TL
61 {
62 std::unique_ptr<TableProperties> props;
63 status_ = ReadTableProperties(file_.get(), file_size,
64 kCuckooTableMagicNumber, ioptions, &props);
65 if (!status_.ok()) {
66 return;
67 }
68 table_props_ = std::move(props);
7c673cae 69 }
1e59de90 70 auto& user_props = table_props_->user_collected_properties;
7c673cae
FG
71 auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
72 if (hash_funs == user_props.end()) {
73 status_ = Status::Corruption("Number of hash functions not found");
74 return;
75 }
76 num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
77 auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
78 if (unused_key == user_props.end()) {
79 status_ = Status::Corruption("Empty bucket value not found");
80 return;
81 }
82 unused_key_ = unused_key->second;
83
1e59de90 84 key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len);
7c673cae
FG
85 auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
86 if (user_key_len == user_props.end()) {
87 status_ = Status::Corruption("User key length not found");
88 return;
89 }
1e59de90
TL
90 user_key_length_ =
91 *reinterpret_cast<const uint32_t*>(user_key_len->second.data());
7c673cae
FG
92
93 auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
94 if (value_length == user_props.end()) {
95 status_ = Status::Corruption("Value length not found");
96 return;
97 }
1e59de90
TL
98 value_length_ =
99 *reinterpret_cast<const uint32_t*>(value_length->second.data());
7c673cae
FG
100 bucket_length_ = key_length_ + value_length_;
101
1e59de90
TL
102 auto hash_table_size =
103 user_props.find(CuckooTablePropertyNames::kHashTableSize);
7c673cae
FG
104 if (hash_table_size == user_props.end()) {
105 status_ = Status::Corruption("Hash table size not found");
106 return;
107 }
1e59de90
TL
108 table_size_ =
109 *reinterpret_cast<const uint64_t*>(hash_table_size->second.data());
7c673cae
FG
110
111 auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
112 if (is_last_level == user_props.end()) {
113 status_ = Status::Corruption("Is last level not found");
114 return;
115 }
116 is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
117
1e59de90
TL
118 auto identity_as_first_hash =
119 user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash);
7c673cae
FG
120 if (identity_as_first_hash == user_props.end()) {
121 status_ = Status::Corruption("identity as first hash not found");
122 return;
123 }
1e59de90
TL
124 identity_as_first_hash_ =
125 *reinterpret_cast<const bool*>(identity_as_first_hash->second.data());
7c673cae 126
1e59de90
TL
127 auto use_module_hash =
128 user_props.find(CuckooTablePropertyNames::kUseModuleHash);
7c673cae
FG
129 if (use_module_hash == user_props.end()) {
130 status_ = Status::Corruption("hash type is not found");
131 return;
132 }
1e59de90
TL
133 use_module_hash_ =
134 *reinterpret_cast<const bool*>(use_module_hash->second.data());
135 auto cuckoo_block_size =
136 user_props.find(CuckooTablePropertyNames::kCuckooBlockSize);
7c673cae
FG
137 if (cuckoo_block_size == user_props.end()) {
138 status_ = Status::Corruption("Cuckoo block size not found");
139 return;
140 }
1e59de90
TL
141 cuckoo_block_size_ =
142 *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data());
7c673cae 143 cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
1e59de90
TL
144 // TODO: rate limit reads of whole cuckoo tables.
145 status_ =
146 file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_,
147 nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
7c673cae
FG
148}
149
11fdf7f2
TL
150Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
151 const Slice& key, GetContext* get_context,
152 const SliceTransform* /* prefix_extractor */,
153 bool /*skip_filters*/) {
7c673cae
FG
154 assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
155 Slice user_key = ExtractUserKey(key);
156 for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
1e59de90
TL
157 uint64_t offset =
158 bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
159 table_size_, identity_as_first_hash_,
160 get_slice_hash_);
7c673cae
FG
161 const char* bucket = &file_data_.data()[offset];
162 for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
163 ++block_idx, bucket += bucket_length_) {
164 if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
165 Slice(bucket, user_key.size()))) {
166 return Status::OK();
167 }
168 // Here, we compare only the user key part as we support only one entry
169 // per user key and we don't support snapshot.
170 if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
171 Slice value(bucket + key_length_, value_length_);
172 if (is_last_level_) {
173 // Sequence number is not stored at the last level, so we will use
174 // kMaxSequenceNumber since it is unknown. This could cause some
175 // transactions to fail to lock a key due to known sequence number.
176 // However, it is expected for anyone to use a CuckooTable in a
177 // TransactionDB.
178 get_context->SaveValue(value, kMaxSequenceNumber);
179 } else {
180 Slice full_key(bucket, key_length_);
181 ParsedInternalKey found_ikey;
20effc67
TL
182 Status s = ParseInternalKey(full_key, &found_ikey,
183 false /* log_err_key */); // TODO
184 if (!s.ok()) return s;
11fdf7f2
TL
185 bool dont_care __attribute__((__unused__));
186 get_context->SaveValue(found_ikey, value, &dont_care);
7c673cae
FG
187 }
188 // We don't support merge operations. So, we return here.
189 return Status::OK();
190 }
191 }
192 }
193 return Status::OK();
194}
195
196void CuckooTableReader::Prepare(const Slice& key) {
197 // Prefetch the first Cuckoo Block.
198 Slice user_key = ExtractUserKey(key);
1e59de90
TL
199 uint64_t addr =
200 reinterpret_cast<uint64_t>(file_data_.data()) +
201 bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
202 identity_as_first_hash_, nullptr);
7c673cae
FG
203 uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
204 for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
205 PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
206 }
207}
208
209class CuckooTableIterator : public InternalIterator {
210 public:
211 explicit CuckooTableIterator(CuckooTableReader* reader);
f67539c2
TL
212 // No copying allowed
213 CuckooTableIterator(const CuckooTableIterator&) = delete;
214 void operator=(const Iterator&) = delete;
494da23a 215 ~CuckooTableIterator() override {}
7c673cae
FG
216 bool Valid() const override;
217 void SeekToFirst() override;
218 void SeekToLast() override;
219 void Seek(const Slice& target) override;
220 void SeekForPrev(const Slice& target) override;
221 void Next() override;
222 void Prev() override;
223 Slice key() const override;
224 Slice value() const override;
11fdf7f2 225 Status status() const override { return Status::OK(); }
7c673cae
FG
226 void InitIfNeeded();
227
228 private:
229 struct BucketComparator {
230 BucketComparator(const Slice& file_data, const Comparator* ucomp,
231 uint32_t bucket_len, uint32_t user_key_len,
232 const Slice& target = Slice())
1e59de90
TL
233 : file_data_(file_data),
234 ucomp_(ucomp),
235 bucket_len_(bucket_len),
236 user_key_len_(user_key_len),
237 target_(target) {}
7c673cae 238 bool operator()(const uint32_t first, const uint32_t second) const {
1e59de90
TL
239 const char* first_bucket = (first == kInvalidIndex)
240 ? target_.data()
241 : &file_data_.data()[first * bucket_len_];
7c673cae 242 const char* second_bucket =
1e59de90
TL
243 (second == kInvalidIndex) ? target_.data()
244 : &file_data_.data()[second * bucket_len_];
7c673cae
FG
245 return ucomp_->Compare(Slice(first_bucket, user_key_len_),
246 Slice(second_bucket, user_key_len_)) < 0;
247 }
1e59de90 248
7c673cae
FG
249 private:
250 const Slice file_data_;
251 const Comparator* ucomp_;
252 const uint32_t bucket_len_;
253 const uint32_t user_key_len_;
254 const Slice target_;
255 };
256
257 const BucketComparator bucket_comparator_;
258 void PrepareKVAtCurrIdx();
259 CuckooTableReader* reader_;
260 bool initialized_;
7c673cae
FG
261 // Contains a map of keys to bucket_id sorted in key order.
262 std::vector<uint32_t> sorted_bucket_ids_;
263 // We assume that the number of items can be stored in uint32 (4 Billion).
264 uint32_t curr_key_idx_;
265 Slice curr_value_;
266 IterKey curr_key_;
7c673cae
FG
267};
268
269CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
1e59de90
TL
270 : bucket_comparator_(reader->file_data_, reader->ucomp_,
271 reader->bucket_length_, reader->user_key_length_),
272 reader_(reader),
273 initialized_(false),
274 curr_key_idx_(kInvalidIndex) {
7c673cae
FG
275 sorted_bucket_ids_.clear();
276 curr_value_.clear();
277 curr_key_.Clear();
278}
279
280void CuckooTableIterator::InitIfNeeded() {
281 if (initialized_) {
282 return;
283 }
1e59de90
TL
284 sorted_bucket_ids_.reserve(
285 static_cast<size_t>(reader_->GetTableProperties()->num_entries));
7c673cae
FG
286 uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
287 assert(num_buckets < kInvalidIndex);
288 const char* bucket = reader_->file_data_.data();
289 for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
290 if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
291 sorted_bucket_ids_.push_back(bucket_id);
292 }
293 bucket += reader_->bucket_length_;
294 }
295 assert(sorted_bucket_ids_.size() ==
1e59de90 296 reader_->GetTableProperties()->num_entries);
7c673cae
FG
297 std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
298 bucket_comparator_);
299 curr_key_idx_ = kInvalidIndex;
300 initialized_ = true;
301}
302
303void CuckooTableIterator::SeekToFirst() {
304 InitIfNeeded();
305 curr_key_idx_ = 0;
306 PrepareKVAtCurrIdx();
307}
308
309void CuckooTableIterator::SeekToLast() {
310 InitIfNeeded();
311 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
312 PrepareKVAtCurrIdx();
313}
314
315void CuckooTableIterator::Seek(const Slice& target) {
316 InitIfNeeded();
317 const BucketComparator seek_comparator(
1e59de90
TL
318 reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
319 reader_->user_key_length_, ExtractUserKey(target));
320 auto seek_it =
321 std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
322 kInvalidIndex, seek_comparator);
7c673cae
FG
323 curr_key_idx_ =
324 static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
325 PrepareKVAtCurrIdx();
326}
327
11fdf7f2 328void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
7c673cae
FG
329 // Not supported
330 assert(false);
331}
332
333bool CuckooTableIterator::Valid() const {
334 return curr_key_idx_ < sorted_bucket_ids_.size();
335}
336
337void CuckooTableIterator::PrepareKVAtCurrIdx() {
338 if (!Valid()) {
339 curr_value_.clear();
340 curr_key_.Clear();
341 return;
342 }
343 uint32_t id = sorted_bucket_ids_[curr_key_idx_];
1e59de90
TL
344 const char* offset =
345 reader_->file_data_.data() + id * reader_->bucket_length_;
7c673cae
FG
346 if (reader_->is_last_level_) {
347 // Always return internal key.
1e59de90
TL
348 curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0,
349 kTypeValue);
7c673cae
FG
350 } else {
351 curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
352 }
353 curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
354}
355
356void CuckooTableIterator::Next() {
357 if (!Valid()) {
358 curr_value_.clear();
359 curr_key_.Clear();
360 return;
361 }
362 ++curr_key_idx_;
363 PrepareKVAtCurrIdx();
364}
365
366void CuckooTableIterator::Prev() {
367 if (curr_key_idx_ == 0) {
368 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
369 }
370 if (!Valid()) {
371 curr_value_.clear();
372 curr_key_.Clear();
373 return;
374 }
375 --curr_key_idx_;
376 PrepareKVAtCurrIdx();
377}
378
379Slice CuckooTableIterator::key() const {
380 assert(Valid());
381 return curr_key_.GetInternalKey();
382}
383
384Slice CuckooTableIterator::value() const {
385 assert(Valid());
386 return curr_value_;
387}
388
7c673cae 389InternalIterator* CuckooTableReader::NewIterator(
11fdf7f2
TL
390 const ReadOptions& /*read_options*/,
391 const SliceTransform* /* prefix_extractor */, Arena* arena,
f67539c2 392 bool /*skip_filters*/, TableReaderCaller /*caller*/,
1e59de90 393 size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
7c673cae 394 if (!status().ok()) {
11fdf7f2 395 return NewErrorInternalIterator<Slice>(
7c673cae
FG
396 Status::Corruption("CuckooTableReader status is not okay."), arena);
397 }
7c673cae
FG
398 CuckooTableIterator* iter;
399 if (arena == nullptr) {
400 iter = new CuckooTableIterator(this);
401 } else {
402 auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
403 iter = new (iter_mem) CuckooTableIterator(this);
404 }
405 return iter;
406}
407
408size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
409
f67539c2 410} // namespace ROCKSDB_NAMESPACE
7c673cae 411#endif