// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#include "table/cuckoo_table_builder.h"

#include <assert.h>
#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/block_builder.h"
#include "table/cuckoo_table_factory.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/autovector.h"
#include "util/file_reader_writer.h"
#include "util/random.h"
#include "util/string_util.h"

namespace rocksdb {
const std::string CuckooTablePropertyNames::kEmptyKey =
    "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashFunc =
    "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kHashTableSize =
    "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength =
    "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel =
    "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
    "rocksdb.cuckoo.hash.cuckooblocksize";
const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
    "rocksdb.cuckoo.hash.identityfirst";
const std::string CuckooTablePropertyNames::kUseModuleHash =
    "rocksdb.cuckoo.hash.usemodule";
const std::string CuckooTablePropertyNames::kUserKeyLength =
    "rocksdb.cuckoo.hash.userkeylength";

// Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;

CuckooTableBuilder::CuckooTableBuilder(
    WritableFileWriter* file, double max_hash_table_ratio,
    uint32_t max_num_hash_table, uint32_t max_search_depth,
    const Comparator* user_comparator, uint32_t cuckoo_block_size,
    bool use_module_hash, bool identity_as_first_hash,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
    uint32_t column_family_id, const std::string& column_family_name)
    : num_hash_func_(2),
      file_(file),
      max_hash_table_ratio_(max_hash_table_ratio),
      max_num_hash_func_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
      hash_table_size_(use_module_hash ? 0 : 2),
      is_last_level_file_(false),
      has_seen_first_key_(false),
      has_seen_first_value_(false),
      key_size_(0),
      value_size_(0),
      num_entries_(0),
      num_values_(0),
      ucomp_(user_comparator),
      use_module_hash_(use_module_hash),
      identity_as_first_hash_(identity_as_first_hash),
      get_slice_hash_(get_slice_hash),
      closed_(false) {
  // All key-value data is stored in a single data block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.column_family_id = column_family_id;
  properties_.column_family_name = column_family_name;
}

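// Illustrative usage sketch (the parameter values below are made up for the
// example, not defaults defined in this file). The builder requires that all
// keys share one size and all values another, that no user key is added
// twice, and that fewer than 2^32 - 1 entries are added:
//
//   CuckooTableBuilder builder(
//       file_writer, 0.9 /* max_hash_table_ratio */,
//       64 /* max_num_hash_table */, 100 /* max_search_depth */,
//       BytewiseComparator(), 5 /* cuckoo_block_size */,
//       false /* use_module_hash */, false /* identity_as_first_hash */,
//       nullptr /* get_slice_hash */, 0 /* column_family_id */, "default");
//   builder.Add(internal_key, value);  // once per entry
//   Status s = builder.Finish();       // builds the table and writes the file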
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
  if (num_entries_ >= kMaxVectorIdx - 1) {
    status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
    return;
  }
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    status_ = Status::Corruption("Unable to parse key into internal key.");
    return;
  }
  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
    status_ = Status::NotSupported("Unsupported key type " +
                                   ToString(ikey.type));
    return;
  }

  // Determine if we can ignore the sequence number and value type of the
  // internal keys by looking at the sequence number of the first key. We
  // assume that if the first key has a zero sequence number, then all the
  // remaining keys will have zero sequence numbers as well.
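  // (For reference: an internal key is the user key followed by an 8-byte
  // trailer packing the sequence number and value type. In a last-level file
  // only the user-key portion is stored, so key_size_ is 8 bytes smaller.)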
  if (!has_seen_first_key_) {
    is_last_level_file_ = ikey.sequence == 0;
    has_seen_first_key_ = true;
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
  }
  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
    status_ = Status::NotSupported("all keys have to be the same size");
    return;
  }

  if (ikey.type == kTypeValue) {
    if (!has_seen_first_value_) {
      has_seen_first_value_ = true;
      value_size_ = value.size();
    }
    if (value_size_ != value.size()) {
      status_ = Status::NotSupported("all values have to be the same size");
      return;
    }

    if (is_last_level_file_) {
      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      kvs_.append(key.data(), key.size());
    }
    kvs_.append(value.data(), value.size());
    ++num_values_;
  } else {
    if (is_last_level_file_) {
      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      deleted_keys_.append(key.data(), key.size());
    }
  }
  ++num_entries_;

  // In order to fill the empty buckets in the hash table, we identify a
  // key that has not been used so far (unused_user_key). We determine this
  // by maintaining the smallest and largest keys inserted so far in bytewise
  // order and using them to find a key outside this range in Finish().
  // Note that this strategy is independent of the user comparator used here.
  if (ikey.user_key.compare(smallest_user_key_) < 0) {
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  }
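  // The hash table is kept a power of two (unless use_module_hash_ is set)
  // and is doubled eagerly so that the load factor stays at or below
  // max_hash_table_ratio_. For example (illustrative numbers), with a ratio
  // of 0.9 and hash_table_size_ == 1024, the table grows to 2048 buckets
  // once num_entries_ / 0.9 exceeds 1024, i.e. at roughly 922 entries.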
  if (!use_module_hash_) {
    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
      hash_table_size_ *= 2;
    }
  }
}

bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
  assert(closed_);
  return idx >= num_values_;
}

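// Keys and values are stored back to back in two flat strings: kvs_ holds
// the live entries as fixed-size (key, value) pairs and deleted_keys_ holds
// the deleted keys. Indices 0 .. num_values_ - 1 refer to kvs_; anything
// larger refers to deleted_keys_. For example (illustrative sizes), with
// key_size_ == 8 and value_size_ == 4, entry idx in kvs_ has its key at byte
// offset idx * 12 and its value at idx * 12 + 8, which is exactly the
// arithmetic GetKey() and GetValue() below perform.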
Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    return Slice(
        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
        static_cast<size_t>(key_size_));
  }
  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
               static_cast<size_t>(key_size_));
}

Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
  assert(closed_);
  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
}

Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
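    // Deleted keys have no stored value, so return a dummy value of the
    // right size. Note that this function-local static is sized from
    // value_size_ on the first call and never re-initialized afterwards.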
    static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
    return Slice(empty_value);
  }
  return Slice(
      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
      static_cast<size_t>(value_size_));
}

Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
  buckets->resize(
      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
  uint32_t make_space_for_key_call_id = 0;
  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
    uint64_t bucket_id = 0;
    bool bucket_found = false;
    autovector<uint64_t> hash_vals;
    Slice user_key = GetUserKey(vector_idx);
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
         ++hash_cnt) {
      uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      // If there is a collision, check the next cuckoo_block_size_ locations
      // for an empty slot. While checking, if we reach the end of the hash
      // table, stop searching and proceed to the next hash function.
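      // For example (illustrative numbers), with cuckoo_block_size_ == 5 and
      // hash_val == 100, buckets 100..104 are probed in order, and the first
      // one whose vector_idx is still kMaxVectorIdx (i.e. empty) is taken.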
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_id = hash_val;
          bucket_found = true;
          break;
        } else {
          if (ucomp_->Compare(user_key, GetUserKey((*buckets)[
                  static_cast<size_t>(hash_val)].vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
        }
      }
    }
    while (!bucket_found && !MakeSpaceForKey(hash_vals,
           ++make_space_for_key_call_id, buckets, &bucket_id)) {
      // Rehash by increasing the number of hash functions.
      if (num_hash_func_ >= max_num_hash_func_) {
        return Status::NotSupported("Too many collisions. Unable to hash.");
      }
      // We don't really need to rehash the entire table because old hashes
      // are still valid and we have only increased the number of hash
      // functions.
      uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      ++num_hash_func_;
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_found = true;
          bucket_id = hash_val;
          break;
        } else {
          hash_vals.push_back(hash_val);
        }
      }
    }
    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
  }
  return Status::OK();
}

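// Finish() writes the file in the following order (a sketch of the layout
// this builder produces; every bucket is bucket_size = key_size_ +
// value_size_ bytes):
//
//   [bucket 0][bucket 1] ... [bucket n-1]   // fixed-size key/value pairs
//   [property block]                        // table properties set below
//   [metaindex block]                       // handle of the property block
//   [footer]                                // magic number + metaindex handle
//
// Empty buckets are written as unused_bucket, which is built from a user key
// that falls outside the [smallest, largest] range of inserted keys.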
Status CuckooTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;
  std::vector<CuckooBucket> buckets;
  Status s;
  std::string unused_bucket;
  if (num_entries_ > 0) {
    // Calculate the real hash table size if module hash is enabled.
    if (use_module_hash_) {
      hash_table_size_ =
          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
    }
    s = MakeHashTable(&buckets);
    if (!s.ok()) {
      return s;
    }
    // Determine unused_user_key to fill empty buckets.
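    // For example (illustrative keys), if smallest_user_key_ is "banana",
    // decrementing the last byte yields "banan`", which sorts before
    // "banana" in bytewise order and is therefore guaranteed to be unused.
    // Only if no byte of the smallest key can be decremented (e.g. all 0x00
    // bytes) do we fall back to incrementing bytes of the largest key.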
    std::string unused_user_key = smallest_user_key_;
    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
    while (curr_pos >= 0) {
      --unused_user_key[curr_pos];
      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
        break;
      }
      --curr_pos;
    }
    if (curr_pos < 0) {
      // Try using the largest key to identify an unused key.
      unused_user_key = largest_user_key_;
      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
      while (curr_pos >= 0) {
        ++unused_user_key[curr_pos];
        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
          break;
        }
        --curr_pos;
      }
    }
    if (curr_pos < 0) {
      return Status::Corruption("Unable to find unused key");
    }
    if (is_last_level_file_) {
      unused_bucket = unused_user_key;
    } else {
      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
      AppendInternalKey(&unused_bucket, ikey);
    }
  }
  properties_.num_entries = num_entries_;
  properties_.fixed_key_len = key_size_;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kValueLength].assign(
      reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));

  uint64_t bucket_size = key_size_ + value_size_;
  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
  // Write the table.
  uint32_t num_added = 0;
  for (auto& bucket : buckets) {
    if (bucket.vector_idx == kMaxVectorIdx) {
      s = file_->Append(Slice(unused_bucket));
    } else {
      ++num_added;
      s = file_->Append(GetKey(bucket.vector_idx));
      if (s.ok()) {
        if (value_size_ > 0) {
          s = file_->Append(GetValue(bucket.vector_idx));
        }
      }
    }
    if (!s.ok()) {
      return s;
    }
  }
  assert(num_added == NumEntries());
  properties_.raw_key_size = num_added * properties_.fixed_key_len;
  properties_.raw_value_size = num_added * value_size_;

  uint64_t offset = buckets.size() * bucket_size;
  properties_.data_size = offset;
  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kNumHashFunc].assign(
      reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));

  properties_.user_collected_properties[
      CuckooTablePropertyNames::kHashTableSize].assign(
      reinterpret_cast<const char*>(&hash_table_size_),
      sizeof(hash_table_size_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kIsLastLevel].assign(
      reinterpret_cast<const char*>(&is_last_level_file_),
      sizeof(is_last_level_file_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kCuckooBlockSize].assign(
      reinterpret_cast<const char*>(&cuckoo_block_size_),
      sizeof(cuckoo_block_size_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
      reinterpret_cast<const char*>(&identity_as_first_hash_),
      sizeof(identity_as_first_hash_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kUseModuleHash].assign(
      reinterpret_cast<const char*>(&use_module_hash_),
      sizeof(use_module_hash_));
  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kUserKeyLength].assign(
      reinterpret_cast<const char*>(&user_key_len),
      sizeof(user_key_len));

  // Write meta blocks.
  MetaIndexBuilder meta_index_builder;
  PropertyBlockBuilder property_block_builder;

  property_block_builder.AddTableProperty(properties_);
  property_block_builder.Add(properties_.user_collected_properties);
  Slice property_block = property_block_builder.Finish();
  BlockHandle property_block_handle;
  property_block_handle.set_offset(offset);
  property_block_handle.set_size(property_block.size());
  s = file_->Append(property_block);
  offset += property_block.size();
  if (!s.ok()) {
    return s;
  }

  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
  Slice meta_index_block = meta_index_builder.Finish();

  BlockHandle meta_index_block_handle;
  meta_index_block_handle.set_offset(offset);
  meta_index_block_handle.set_size(meta_index_block.size());
  s = file_->Append(meta_index_block);
  if (!s.ok()) {
    return s;
  }

  Footer footer(kCuckooTableMagicNumber, 1);
  footer.set_metaindex_handle(meta_index_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  s = file_->Append(footer_encoding);
  return s;
}

void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}

uint64_t CuckooTableBuilder::NumEntries() const {
  return num_entries_;
}

uint64_t CuckooTableBuilder::FileSize() const {
  if (closed_) {
    return file_->GetFileSize();
  } else if (num_entries_ == 0) {
    return 0;
  }

  if (use_module_hash_) {
    return static_cast<uint64_t>((key_size_ + value_size_) *
                                 num_entries_ / max_hash_table_ratio_);
  } else {
    // Account for the bucket count being a power of two.
    // As elements are added, the file size stays constant for a while and
    // then doubles. Since the compaction algorithm stops adding elements
    // only after it exceeds the file limit, we account for the extra
    // element being added here.
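    // For example (illustrative numbers), with key_size_ + value_size_ == 16,
    // hash_table_size_ == 1024 and max_hash_table_ratio_ == 0.9, the estimate
    // stays at 16 * 1024 - 1 bytes until (num_entries_ + 1) / 0.9 exceeds
    // 1024, at which point it doubles to 16 * 2048 - 1 bytes.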
    uint64_t expected_hash_table_size = hash_table_size_;
    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
      expected_hash_table_size *= 2;
    }
    return (key_size_ + value_size_) * expected_hash_table_size - 1;
  }
}

// This method is invoked when there is no place to insert the target key.
// It searches for a set of elements that can be moved to accommodate the
// target key. The search is a BFS graph traversal whose first level
// (hash_vals) consists of all the buckets the target key could go to.
// Then, from each node (curr_node), we find all the buckets that curr_node
// could go to. They form the children of curr_node in the tree.
// We continue the traversal until we find an empty bucket, in which case we
// move all elements along the path from the first level to this empty bucket
// to make space for the target key, which is inserted at the first level
// (*bucket_id).
// If the tree depth exceeds the max depth, we return false to indicate
// failure.
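// For example (an illustrative trace with two hash functions): suppose the
// target key hashes to full buckets 4 and 9, so the first level is {4, 9}.
// If the key resident in bucket 4 also hashes to bucket 7, and bucket 7 is
// empty, the BFS stops there: the element in bucket 4 moves to bucket 7 and
// the target key takes bucket 4 (*bucket_id == 4).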
bool CuckooTableBuilder::MakeSpaceForKey(
    const autovector<uint64_t>& hash_vals,
    const uint32_t make_space_for_key_call_id,
    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
  struct CuckooNode {
    uint64_t bucket_id;
    uint32_t depth;
    uint32_t parent_pos;
    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
  };
  // This is the BFS search tree, stored simply as a vector.
  // Each node stores the index of its parent node in the vector.
  std::vector<CuckooNode> tree;
  // We want to identify the buckets already visited in the current method
  // call so that we don't add the same buckets to the tree again. We do this
  // by maintaining a count of method calls in make_space_for_key_call_id,
  // which acts as a unique id for this invocation of the method, and storing
  // this number in the nodes that we explore in the current call.
  // It is unlikely for the increment operation to overflow because the
  // maximum number of times this will be called is <= max_num_hash_func_ +
  // num_entries_.
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t bid = hash_vals[hash_cnt];
    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
        make_space_for_key_call_id;
    tree.push_back(CuckooNode(bid, 0, 0));
  }
  bool null_found = false;
  uint32_t curr_pos = 0;
  while (!null_found && curr_pos < tree.size()) {
    CuckooNode& curr_node = tree[curr_pos];
    uint32_t curr_depth = curr_node.depth;
    if (curr_depth >= max_search_depth_) {
      break;
    }
    CuckooBucket& curr_bucket =
        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
    for (uint32_t hash_cnt = 0;
         hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
      uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
          hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
          get_slice_hash_);
      // Iterate inside the cuckoo block.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++child_bucket_id) {
        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
                .make_space_for_key_call_id == make_space_for_key_call_id) {
          continue;
        }
        (*buckets)[static_cast<size_t>(child_bucket_id)]
            .make_space_for_key_call_id = make_space_for_key_call_id;
        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
            kMaxVectorIdx) {
          null_found = true;
          break;
        }
      }
    }
    ++curr_pos;
  }

  if (null_found) {
    // There is an empty node in tree.back(). Now, traverse the path from this
    // empty node to the top of the tree, and at every node in the path,
    // replace the child with the parent. Stop when the first level is reached
    // (which happens when 0 <= bucket_to_replace_pos < num_hash_func_) and
    // return this first-level location for the target key to be inserted.
    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
    while (bucket_to_replace_pos >= num_hash_func_) {
      CuckooNode& curr_node = tree[bucket_to_replace_pos];
      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
          (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
      bucket_to_replace_pos = curr_node.parent_pos;
    }
    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
  }
  return null_found;
}

}  // namespace rocksdb
#endif  // ROCKSDB_LITE