// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#include "table/cuckoo/cuckoo_table_builder.h"

#include <assert.h>

#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/block_based/block_builder.h"
#include "table/cuckoo/cuckoo_table_factory.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/autovector.h"
#include "util/random.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {
const std::string CuckooTablePropertyNames::kEmptyKey =
    "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashFunc =
    "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kHashTableSize =
    "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength =
    "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel =
    "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
    "rocksdb.cuckoo.hash.cuckooblocksize";
const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
    "rocksdb.cuckoo.hash.identityfirst";
const std::string CuckooTablePropertyNames::kUseModuleHash =
    "rocksdb.cuckoo.hash.usemodule";
const std::string CuckooTablePropertyNames::kUserKeyLength =
    "rocksdb.cuckoo.hash.userkeylength";

// Obtained by running: echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;

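// A quick map of the tuning parameters (summarized from their use below):
// - max_hash_table_ratio caps the load factor: the table is sized or grown
//   so that num_entries_ / hash_table_size_ stays below this ratio.
// - max_num_hash_table bounds how many hash functions may be added while
//   resolving collisions (stored as max_num_hash_func_).
// - max_search_depth bounds the BFS displacement search in MakeSpaceForKey().
// - cuckoo_block_size makes each probe check that many consecutive buckets,
//   trading a few comparisons for better cache locality.
// - identity_as_first_hash uses the key bytes themselves as the first hash,
//   useful when user keys are already uniformly distributed.
// - use_module_hash selects modulo-based bucket mapping with an exact table
//   size instead of a power-of-two table.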
CuckooTableBuilder::CuckooTableBuilder(
    WritableFileWriter* file, double max_hash_table_ratio,
    uint32_t max_num_hash_table, uint32_t max_search_depth,
    const Comparator* user_comparator, uint32_t cuckoo_block_size,
    bool use_module_hash, bool identity_as_first_hash,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
    uint32_t column_family_id, const std::string& column_family_name,
    const std::string& db_id, const std::string& db_session_id,
    uint64_t file_number)
    : num_hash_func_(2),
      file_(file),
      max_hash_table_ratio_(max_hash_table_ratio),
      max_num_hash_func_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
      hash_table_size_(use_module_hash ? 0 : 2),
      is_last_level_file_(false),
      has_seen_first_key_(false),
      has_seen_first_value_(false),
      key_size_(0),
      value_size_(0),
      num_entries_(0),
      num_values_(0),
      ucomp_(user_comparator),
      use_module_hash_(use_module_hash),
      identity_as_first_hash_(identity_as_first_hash),
      get_slice_hash_(get_slice_hash),
      closed_(false) {
  // Data is in a huge block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.column_family_id = column_family_id;
  properties_.column_family_name = column_family_name;
  properties_.db_id = db_id;
  properties_.db_session_id = db_session_id;
  properties_.orig_file_number = file_number;
  status_.PermitUncheckedError();
  io_status_.PermitUncheckedError();
}

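// An illustrative driver sketch (not part of this file; the argument values
// are examples only and the file writer setup is assumed):
//
//   // WritableFileWriter* file = ...;  // opened by the caller
//   CuckooTableBuilder builder(
//       file, /*max_hash_table_ratio=*/0.9, /*max_num_hash_table=*/64,
//       /*max_search_depth=*/100, BytewiseComparator(),
//       /*cuckoo_block_size=*/5, /*use_module_hash=*/true,
//       /*identity_as_first_hash=*/false, /*get_slice_hash=*/nullptr,
//       /*column_family_id=*/0, "default", /*db_id=*/"",
//       /*db_session_id=*/"", /*file_number=*/0);
//   // for each entry: builder.Add(internal_key, value);
//   // then: builder.Finish() on success, builder.Abandon() otherwise.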
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
  if (num_entries_ >= kMaxVectorIdx - 1) {
    status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
    return;
  }
  ParsedInternalKey ikey;
  Status pik_status =
      ParseInternalKey(key, &ikey, false /* log_err_key */);  // TODO
  if (!pik_status.ok()) {
    status_ = Status::Corruption("Unable to parse key into internal key. ",
                                 pik_status.getState());
    return;
  }
  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
    status_ = Status::NotSupported("Unsupported key type " +
                                   std::to_string(ikey.type));
    return;
  }

  // Determine whether we can omit the sequence number and value type from
  // the stored keys by looking at the sequence number of the first key. We
  // assume that if the first key has a zero sequence number, all remaining
  // keys have zero sequence numbers as well.
  if (!has_seen_first_key_) {
    is_last_level_file_ = ikey.sequence == 0;
    has_seen_first_key_ = true;
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
  }
  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
    status_ = Status::NotSupported("all keys have to be the same size");
    return;
  }

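  // Records are packed back to back into the flat kvs_ string: key_size_ key
  // bytes immediately followed by value_size_ value bytes. GetKey() and
  // GetValue() below rely on this fixed-size layout to locate entries by
  // plain offset arithmetic.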
  if (ikey.type == kTypeValue) {
    if (!has_seen_first_value_) {
      has_seen_first_value_ = true;
      value_size_ = value.size();
    }
    if (value_size_ != value.size()) {
      status_ = Status::NotSupported("all values have to be the same size");
      return;
    }

    if (is_last_level_file_) {
      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      kvs_.append(key.data(), key.size());
    }
    kvs_.append(value.data(), value.size());
    ++num_values_;
  } else {
    if (is_last_level_file_) {
      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      deleted_keys_.append(key.data(), key.size());
    }
  }
  ++num_entries_;

  // In order to fill the empty buckets in the hash table, we need a key that
  // is never inserted by the user (unused_user_key). We find one by
  // maintaining the smallest and largest user keys inserted so far in
  // bytewise order, and using them in the Finish() operation to construct a
  // key outside that range. Note that this strategy is independent of the
  // user comparator used here.
  if (ikey.user_key.compare(smallest_user_key_) < 0) {
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  }
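  // In power-of-two mode (use_module_hash_ == false), double the table size
  // whenever the current size can no longer keep occupancy below
  // max_hash_table_ratio_; CuckooHash() can then map a hash to a bucket with
  // a bitmask instead of a modulo.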
  if (!use_module_hash_) {
    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
      hash_table_size_ *= 2;
    }
  }
}

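// Entries are addressed by a single vector index: indices in
// [0, num_values_) refer to key/value records packed in kvs_, while indices
// in [num_values_, num_entries_) refer to deletion entries packed in
// deleted_keys_.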
bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
  assert(closed_);
  return idx >= num_values_;
}

Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    return Slice(
        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
        static_cast<size_t>(key_size_));
  }
  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
               static_cast<size_t>(key_size_));
}

Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
  assert(closed_);
  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
}

Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
    return Slice(empty_value);
  }
  return Slice(
      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
      static_cast<size_t>(value_size_));
}

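// MakeHashTable performs classic cuckoo-hashing insertion: each key has
// num_hash_func_ candidate buckets (each widened to cuckoo_block_size_
// consecutive slots). If all candidates are occupied, we first try to
// displace existing keys via MakeSpaceForKey(); if that fails too, we add
// another hash function, up to max_num_hash_func_.
//
// A tiny worked example (hypothetical, two hash functions, block size 1):
// say key A's candidate buckets are {0, 2}, key B's are {1, 2} and key C's
// are {0, 1}. After A -> 0 and B -> 1, key C finds both of its candidates
// occupied; the BFS then discovers that A (in bucket 0) can move to its
// empty alternative bucket 2, so A is moved there and C takes bucket 0.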
Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
  buckets->resize(
      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
  uint32_t make_space_for_key_call_id = 0;
  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
    uint64_t bucket_id = 0;
    bool bucket_found = false;
    autovector<uint64_t> hash_vals;
    Slice user_key = GetUserKey(vector_idx);
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
         ++hash_cnt) {
      uint64_t hash_val =
          CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
                     identity_as_first_hash_, get_slice_hash_);
      // On a collision, check the next cuckoo_block_size_ locations for an
      // empty bucket (a bucket is empty while its vector_idx is still the
      // kMaxVectorIdx sentinel). If we reach the end of the hash table while
      // checking, stop searching and proceed with the next hash function.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_id = hash_val;
          bucket_found = true;
          break;
        } else {
          if (ucomp_->Compare(
                  user_key, GetUserKey((*buckets)[static_cast<size_t>(hash_val)]
                                           .vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
        }
      }
    }
    while (!bucket_found &&
           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
                            &bucket_id)) {
      // Rehash by increasing the number of hash functions.
      if (num_hash_func_ >= max_num_hash_func_) {
        return Status::NotSupported("Too many collisions. Unable to hash.");
      }
      // We don't need to rehash the entire table because the old hashes are
      // still valid; we only increased the number of hash functions.
      uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
                                     hash_table_size_, identity_as_first_hash_,
                                     get_slice_hash_);
      ++num_hash_func_;
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_found = true;
          bucket_id = hash_val;
          break;
        } else {
          hash_vals.push_back(hash_val);
        }
      }
    }
    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
  }
  return Status::OK();
}

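// Finish() lays out the file as:
//   [bucket array][properties block][meta-index block][footer]
// Empty buckets are filled with a synthetic unused key so that every bucket
// occupies the same fixed number of bytes.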
Status CuckooTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;
  std::vector<CuckooBucket> buckets;
  std::string unused_bucket;
  if (num_entries_ > 0) {
    // Calculate the real hash table size if modulo-based hashing is enabled.
    if (use_module_hash_) {
      hash_table_size_ =
          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
    }
    status_ = MakeHashTable(&buckets);
    if (!status_.ok()) {
      return status_;
    }
    // Determine an unused_user_key to fill the empty buckets.
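    // Example: if the smallest user key is "bbb", decrementing its last byte
    // yields "bba", which sorts before every inserted key. If every byte of
    // the smallest key is 0x00, no decrement works and we fall back to
    // incrementing the largest key instead.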
    std::string unused_user_key = smallest_user_key_;
    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
    while (curr_pos >= 0) {
      --unused_user_key[curr_pos];
      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
        break;
      }
      --curr_pos;
    }
    if (curr_pos < 0) {
      // Try using the largest key to identify an unused key.
      unused_user_key = largest_user_key_;
      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
      while (curr_pos >= 0) {
        ++unused_user_key[curr_pos];
        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
          break;
        }
        --curr_pos;
      }
    }
    if (curr_pos < 0) {
      return Status::Corruption("Unable to find unused key");
    }
    if (is_last_level_file_) {
      unused_bucket = unused_user_key;
    } else {
      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
      AppendInternalKey(&unused_bucket, ikey);
    }
  }
  properties_.num_entries = num_entries_;
  properties_.num_deletions = num_entries_ - num_values_;
  properties_.fixed_key_len = key_size_;
  properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
      .assign(reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));

  uint64_t bucket_size = key_size_ + value_size_;
  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
  // Write the table.
  uint32_t num_added = 0;
  for (auto& bucket : buckets) {
    if (bucket.vector_idx == kMaxVectorIdx) {
      io_status_ = file_->Append(Slice(unused_bucket));
    } else {
      ++num_added;
      io_status_ = file_->Append(GetKey(bucket.vector_idx));
      if (io_status_.ok()) {
        if (value_size_ > 0) {
          io_status_ = file_->Append(GetValue(bucket.vector_idx));
        }
      }
    }
    if (!io_status_.ok()) {
      status_ = io_status_;
      return status_;
    }
  }
  assert(num_added == NumEntries());
  properties_.raw_key_size = num_added * properties_.fixed_key_len;
  properties_.raw_value_size = num_added * value_size_;

  uint64_t offset = buckets.size() * bucket_size;
  properties_.data_size = offset;
  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
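  // kEmptyKey stores the filler key verbatim; the numeric properties below
  // are stored as the raw bytes of their in-memory representation (via
  // reinterpret_cast), so the reader must decode them with matching types.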
  properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
      unused_bucket;
  properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
      .assign(reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));

  properties_
      .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
      .assign(reinterpret_cast<const char*>(&hash_table_size_),
              sizeof(hash_table_size_));
  properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
      .assign(reinterpret_cast<const char*>(&is_last_level_file_),
              sizeof(is_last_level_file_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
      .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
              sizeof(cuckoo_block_size_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
      .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
              sizeof(identity_as_first_hash_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
      .assign(reinterpret_cast<const char*>(&use_module_hash_),
              sizeof(use_module_hash_));
  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
      .assign(reinterpret_cast<const char*>(&user_key_len),
              sizeof(user_key_len));

  // Write meta blocks.
  MetaIndexBuilder meta_index_builder;
  PropertyBlockBuilder property_block_builder;

  property_block_builder.AddTableProperty(properties_);
  property_block_builder.Add(properties_.user_collected_properties);
  Slice property_block = property_block_builder.Finish();
  BlockHandle property_block_handle;
  property_block_handle.set_offset(offset);
  property_block_handle.set_size(property_block.size());
  io_status_ = file_->Append(property_block);
  offset += property_block.size();
  if (!io_status_.ok()) {
    status_ = io_status_;
    return status_;
  }

  meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
  Slice meta_index_block = meta_index_builder.Finish();

  BlockHandle meta_index_block_handle;
  meta_index_block_handle.set_offset(offset);
  meta_index_block_handle.set_size(meta_index_block.size());
  io_status_ = file_->Append(meta_index_block);
  if (!io_status_.ok()) {
    status_ = io_status_;
    return status_;
  }

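  // The footer records the magic number, the format version and the handle
  // of the meta-index block; readers parse it first to locate everything
  // else.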
  FooterBuilder footer;
  footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
               kNoChecksum, meta_index_block_handle);
  io_status_ = file_->Append(footer.GetSlice());
  status_ = io_status_;
  return status_;
}

void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}

uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }

uint64_t CuckooTableBuilder::FileSize() const {
  if (closed_) {
    return file_->GetFileSize();
  } else if (num_entries_ == 0) {
    return 0;
  }

  if (use_module_hash_) {
    return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
                                 max_hash_table_ratio_);
  } else {
    // Account for the bucket count being a power of two.
    // As elements are added, the file size stays constant for a while and
    // then doubles. Since the compaction algorithm stops adding elements
    // only after the file limit is exceeded, we account here for the extra
    // element being added.
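    // Example: with key_size_ + value_size_ == 16, max_hash_table_ratio_ ==
    // 0.9 and hash_table_size_ == 128, reaching num_entries_ == 115 makes
    // (115 + 1) / 0.9 ~= 128.9 exceed 128, so the estimate jumps from
    // 16 * 128 - 1 to 16 * 256 - 1 bytes.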
    uint64_t expected_hash_table_size = hash_table_size_;
    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
      expected_hash_table_size *= 2;
    }
    return (key_size_ + value_size_) * expected_hash_table_size - 1;
  }
}

// This method is invoked when there is no place to insert the target key.
// It searches for a set of elements that can be moved to accommodate the
// target key. The search is a BFS graph traversal whose first level
// (hash_vals) consists of all the buckets the target key could go to.
// Then, from each node (curr_node), we find all the buckets its resident
// key could move to; these form the children of curr_node in the tree.
// We continue the traversal until we find an empty bucket, at which point
// we move all elements along the path from the first level to this empty
// bucket, making space for the target key, which is inserted at the first
// level (*bucket_id).
// If the tree depth exceeds the max depth, we return false to indicate
// failure.
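// For instance (hypothetical), if the target key's candidate buckets {0, 1}
// are both full, the first BFS level is {0, 1}; the children of node 0 are
// the candidate buckets of the key currently stored in bucket 0, and so on,
// level by level, until an empty bucket turns up or the depth bound is hit.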
bool CuckooTableBuilder::MakeSpaceForKey(
    const autovector<uint64_t>& hash_vals,
    const uint32_t make_space_for_key_call_id,
    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
  struct CuckooNode {
    uint64_t bucket_id;
    uint32_t depth;
    uint32_t parent_pos;
    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
  };
  // This is the BFS search tree, stored simply as a vector.
  // Each node stores the index of its parent node within the vector.
  std::vector<CuckooNode> tree;
  // We want to identify the buckets already visited in the current method
  // call so that we don't add the same buckets to the tree again. We do this
  // by maintaining a count of method invocations in
  // make_space_for_key_call_id, which acts as a unique id for this
  // invocation; we stamp this number onto the buckets we explore in the
  // current call.
  // It is unlikely for the increment operation to overflow because the
  // maximum number of times this is called is <= max_num_hash_func_ +
  // num_entries_.
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t bid = hash_vals[hash_cnt];
    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
        make_space_for_key_call_id;
    tree.push_back(CuckooNode(bid, 0, 0));
  }
  bool null_found = false;
  uint32_t curr_pos = 0;
  while (!null_found && curr_pos < tree.size()) {
    CuckooNode& curr_node = tree[curr_pos];
    uint32_t curr_depth = curr_node.depth;
    if (curr_depth >= max_search_depth_) {
      break;
    }
    CuckooBucket& curr_bucket =
        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
         ++hash_cnt) {
      uint64_t child_bucket_id = CuckooHash(
          GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      // Iterate inside the Cuckoo Block.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++child_bucket_id) {
        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
                .make_space_for_key_call_id == make_space_for_key_call_id) {
          continue;
        }
        (*buckets)[static_cast<size_t>(child_bucket_id)]
            .make_space_for_key_call_id = make_space_for_key_call_id;
        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
            kMaxVectorIdx) {
          null_found = true;
          break;
        }
      }
    }
    ++curr_pos;
  }

  if (null_found) {
    // The last node pushed, tree.back(), is an empty bucket. Traverse the
    // path from this empty node to the top of the tree, and at every node on
    // the path replace the child with its parent. Stop when the first level
    // is reached in the tree (which happens when
    // 0 <= bucket_to_replace_pos < num_hash_func_) and return this
    // first-level location for the target key to be inserted.
    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
    while (bucket_to_replace_pos >= num_hash_func_) {
      CuckooNode& curr_node = tree[bucket_to_replace_pos];
      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
          (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
      bucket_to_replace_pos = curr_node.parent_pos;
    }
    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
  }
  return null_found;
}

std::string CuckooTableBuilder::GetFileChecksum() const {
  if (file_ != nullptr) {
    return file_->GetFileChecksum();
  } else {
    return kUnknownFileChecksum;
  }
}

const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
  if (file_ != nullptr) {
    return file_->GetFileChecksumFuncName();
  } else {
    return kUnknownFileChecksumFuncName;
  }
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE