// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#include "table/cuckoo/cuckoo_table_builder.h"

#include <assert.h>

#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/block_based/block_builder.h"
#include "table/cuckoo/cuckoo_table_factory.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/autovector.h"
#include "util/random.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {

const std::string CuckooTablePropertyNames::kEmptyKey =
    "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashFunc =
    "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kHashTableSize =
    "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength =
    "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel =
    "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
    "rocksdb.cuckoo.hash.cuckooblocksize";
const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
    "rocksdb.cuckoo.hash.identityfirst";
const std::string CuckooTablePropertyNames::kUseModuleHash =
    "rocksdb.cuckoo.hash.usemodule";
const std::string CuckooTablePropertyNames::kUserKeyLength =
    "rocksdb.cuckoo.hash.userkeylength";

// Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;

CuckooTableBuilder::CuckooTableBuilder(
    WritableFileWriter* file, double max_hash_table_ratio,
    uint32_t max_num_hash_table, uint32_t max_search_depth,
    const Comparator* user_comparator, uint32_t cuckoo_block_size,
    bool use_module_hash, bool identity_as_first_hash,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
    uint32_t column_family_id, const std::string& column_family_name,
    const std::string& db_id, const std::string& db_session_id,
    uint64_t file_number)
    : num_hash_func_(2),
      file_(file),
      max_hash_table_ratio_(max_hash_table_ratio),
      max_num_hash_func_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
      hash_table_size_(use_module_hash ? 0 : 2),
      is_last_level_file_(false),
      has_seen_first_key_(false),
      has_seen_first_value_(false),
      key_size_(0),
      value_size_(0),
      num_entries_(0),
      num_values_(0),
      ucomp_(user_comparator),
      use_module_hash_(use_module_hash),
      identity_as_first_hash_(identity_as_first_hash),
      get_slice_hash_(get_slice_hash),
      closed_(false) {
  // Data is in a huge block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.column_family_id = column_family_id;
  properties_.column_family_name = column_family_name;
  properties_.db_id = db_id;
  properties_.db_session_id = db_session_id;
  properties_.orig_file_number = file_number;
  status_.PermitUncheckedError();
  io_status_.PermitUncheckedError();
}

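// Usage sketch (illustrative only; the concrete parameter values and variable
// names below are assumptions, not taken from this file). The builder is
// normally constructed by CuckooTableFactory and driven roughly like:
//
//   CuckooTableBuilder builder(file, /*max_hash_table_ratio=*/0.9,
//                              /*max_num_hash_table=*/64,
//                              /*max_search_depth=*/100, ucomp,
//                              /*cuckoo_block_size=*/5,
//                              /*use_module_hash=*/true,
//                              /*identity_as_first_hash=*/false,
//                              /*get_slice_hash=*/nullptr, cf_id, cf_name,
//                              db_id, db_session_id, file_number);
//   // repeated builder.Add(key, value) calls, then:
//   Status s = builder.Finish();
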
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
  if (num_entries_ >= kMaxVectorIdx - 1) {
    status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
    return;
  }
  ParsedInternalKey ikey;
  Status pik_status =
      ParseInternalKey(key, &ikey, false /* log_err_key */);  // TODO
  if (!pik_status.ok()) {
    status_ = Status::Corruption("Unable to parse key into internal key. ",
                                 pik_status.getState());
    return;
  }
  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
    status_ = Status::NotSupported("Unsupported key type " +
                                   std::to_string(ikey.type));
    return;
  }

  // Determine if we can ignore the sequence number and value type from
  // internal keys by looking at sequence number from first key. We assume
  // that if first key has a zero sequence number, then all the remaining
  // keys will have zero seq. no.
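  // (In practice this holds for files written for the bottommost level, where
  // RocksDB zeroes out sequence numbers whenever it is safe to do so.)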
  if (!has_seen_first_key_) {
    is_last_level_file_ = ikey.sequence == 0;
    has_seen_first_key_ = true;
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
  }
  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
    status_ = Status::NotSupported("all keys have to be the same size");
    return;
  }

  if (ikey.type == kTypeValue) {
    if (!has_seen_first_value_) {
      has_seen_first_value_ = true;
      value_size_ = value.size();
    }
    if (value_size_ != value.size()) {
      status_ = Status::NotSupported("all values have to be the same size");
      return;
    }

    if (is_last_level_file_) {
      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      kvs_.append(key.data(), key.size());
    }
    kvs_.append(value.data(), value.size());
    ++num_values_;
  } else {
    if (is_last_level_file_) {
      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      deleted_keys_.append(key.data(), key.size());
    }
  }
  ++num_entries_;

  // In order to fill the empty buckets in the hash table, we identify a
  // key which is not used so far (unused_user_key). We determine this by
  // maintaining smallest and largest keys inserted so far in bytewise order
  // and use them to find a key outside this range in Finish() operation.
  // Note that this strategy is independent of user comparator used here.
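  // Illustrative example (not from the original comment): if the smallest
  // user key seen is "bbb" and the largest is "ddd", Finish() can use "bba"
  // (one byte below the smallest) or "dde" (one byte above the largest) as a
  // filler key, since such a key cannot equal any inserted key in bytewise
  // order.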
  if (ikey.user_key.compare(smallest_user_key_) < 0) {
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  }
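  // Worked example for the doubling below (numbers are illustrative, not from
  // this file): with max_hash_table_ratio_ = 0.9 and hash_table_size_ = 1024,
  // the table doubles to 2048 buckets once num_entries_ / 0.9 exceeds 1024,
  // i.e. once roughly 922 entries have been added.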
  if (!use_module_hash_) {
    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
      hash_table_size_ *= 2;
    }
  }
}

bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
  assert(closed_);
  return idx >= num_values_;
}

Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    return Slice(
        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
        static_cast<size_t>(key_size_));
  }
  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
               static_cast<size_t>(key_size_));
}

Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
  assert(closed_);
  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
}

Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    static std::string empty_value(static_cast<unsigned int>(value_size_),
                                   'a');
    return Slice(empty_value);
  }
  return Slice(
      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
      static_cast<size_t>(value_size_));
}

Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
  buckets->resize(
      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
  uint32_t make_space_for_key_call_id = 0;
  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
    uint64_t bucket_id = 0;
    bool bucket_found = false;
    autovector<uint64_t> hash_vals;
    Slice user_key = GetUserKey(vector_idx);
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
         ++hash_cnt) {
      uint64_t hash_val =
          CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
                     identity_as_first_hash_, get_slice_hash_);
      // If there is a collision, check next cuckoo_block_size_ locations for
      // empty locations. While checking, if we reach end of the hash table,
      // stop searching and proceed for next hash function.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_id = hash_val;
          bucket_found = true;
          break;
        } else {
          if (ucomp_->Compare(user_key,
                              GetUserKey((*buckets)[static_cast<size_t>(
                                                        hash_val)]
                                             .vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
        }
      }
    }
    while (!bucket_found &&
           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
                            &bucket_id)) {
      // Rehash by increasing number of hash tables.
      if (num_hash_func_ >= max_num_hash_func_) {
        return Status::NotSupported("Too many collisions. Unable to hash.");
      }
      // We don't really need to rehash the entire table because old hashes are
      // still valid and we only increased the number of hash functions.
      uint64_t hash_val =
          CuckooHash(user_key, num_hash_func_, use_module_hash_,
                     hash_table_size_, identity_as_first_hash_,
                     get_slice_hash_);
      ++num_hash_func_;
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_found = true;
          bucket_id = hash_val;
          break;
        } else {
          hash_vals.push_back(hash_val);
        }
      }
    }
    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
  }
  return Status::OK();
}

Status CuckooTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;
  std::vector<CuckooBucket> buckets;
  std::string unused_bucket;
  if (num_entries_ > 0) {
    // Calculate the real hash size if module hash is enabled.
    if (use_module_hash_) {
      hash_table_size_ =
          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
    }
    status_ = MakeHashTable(&buckets);
    if (!status_.ok()) {
      return status_;
    }

    // Determine unused_user_key to fill empty buckets.
    std::string unused_user_key = smallest_user_key_;
    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
    while (curr_pos >= 0) {
      --unused_user_key[curr_pos];
      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
        break;
      }
      --curr_pos;
    }
    if (curr_pos < 0) {
      // Try using the largest key to identify an unused key.
      unused_user_key = largest_user_key_;
      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
      while (curr_pos >= 0) {
        ++unused_user_key[curr_pos];
        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
          break;
        }
        --curr_pos;
      }
    }
    if (curr_pos < 0) {
      return Status::Corruption("Unable to find unused key");
    }
    if (is_last_level_file_) {
      unused_bucket = unused_user_key;
    } else {
      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
      AppendInternalKey(&unused_bucket, ikey);
    }
  }
  properties_.num_entries = num_entries_;
  properties_.num_deletions = num_entries_ - num_values_;
  properties_.fixed_key_len = key_size_;
  properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
      .assign(reinterpret_cast<const char*>(&value_size_),
              sizeof(value_size_));

  uint64_t bucket_size = key_size_ + value_size_;
  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
  // Write the table.
  uint32_t num_added = 0;
  for (auto& bucket : buckets) {
    if (bucket.vector_idx == kMaxVectorIdx) {
      io_status_ = file_->Append(Slice(unused_bucket));
    } else {
      ++num_added;
      io_status_ = file_->Append(GetKey(bucket.vector_idx));
      if (io_status_.ok()) {
        if (value_size_ > 0) {
          io_status_ = file_->Append(GetValue(bucket.vector_idx));
        }
      }
    }
    if (!io_status_.ok()) {
      status_ = io_status_;
      return status_;
    }
  }
  assert(num_added == NumEntries());
  properties_.raw_key_size = num_added * properties_.fixed_key_len;
  properties_.raw_value_size = num_added * value_size_;
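
  // At this point the data section holds hash_table_size_ + cuckoo_block_size_
  // - 1 fixed-size buckets of key_size_ + value_size_ bytes each; buckets that
  // received no entry were filled with the unused_bucket pattern above.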
  uint64_t offset = buckets.size() * bucket_size;
  properties_.data_size = offset;
  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
  properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
      unused_bucket;
  properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
      .assign(reinterpret_cast<char*>(&num_hash_func_),
              sizeof(num_hash_func_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
      .assign(reinterpret_cast<const char*>(&hash_table_size_),
              sizeof(hash_table_size_));
  properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
      .assign(reinterpret_cast<const char*>(&is_last_level_file_),
              sizeof(is_last_level_file_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
      .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
              sizeof(cuckoo_block_size_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
      .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
              sizeof(identity_as_first_hash_));
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
      .assign(reinterpret_cast<const char*>(&use_module_hash_),
              sizeof(use_module_hash_));
  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
  properties_
      .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
      .assign(reinterpret_cast<const char*>(&user_key_len),
              sizeof(user_key_len));

  // Write meta blocks.
  MetaIndexBuilder meta_index_builder;
  PropertyBlockBuilder property_block_builder;

  property_block_builder.AddTableProperty(properties_);
  property_block_builder.Add(properties_.user_collected_properties);
  Slice property_block = property_block_builder.Finish();
  BlockHandle property_block_handle;
  property_block_handle.set_offset(offset);
  property_block_handle.set_size(property_block.size());
  io_status_ = file_->Append(property_block);
  offset += property_block.size();
  if (!io_status_.ok()) {
    status_ = io_status_;
    return status_;
  }

  meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
  Slice meta_index_block = meta_index_builder.Finish();

  BlockHandle meta_index_block_handle;
  meta_index_block_handle.set_offset(offset);
  meta_index_block_handle.set_size(meta_index_block.size());
  io_status_ = file_->Append(meta_index_block);
  if (!io_status_.ok()) {
    status_ = io_status_;
    return status_;
  }
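
  // The resulting file layout is: [data buckets][properties block]
  // [metaindex block][footer]. The footer written below records the
  // metaindex handle so readers can locate the other blocks.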
  FooterBuilder footer;
  footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
               kNoChecksum, meta_index_block_handle);
  io_status_ = file_->Append(footer.GetSlice());
  status_ = io_status_;
  return status_;
}

void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}

uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }

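// Illustrative sizing example for the estimate below (numbers are
// assumptions, not from this file): with key_size_ = 16, value_size_ = 8 and
// max_hash_table_ratio_ = 0.9, module hashing estimates roughly
// 24 * num_entries_ / 0.9 bytes; without module hashing the estimate uses the
// next power-of-two table size, e.g. (16 + 8) * 2048 - 1 bytes once the
// current 1024-bucket table would exceed the ratio.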
uint64_t CuckooTableBuilder::FileSize() const {
  if (closed_) {
    return file_->GetFileSize();
  } else if (num_entries_ == 0) {
    return 0;
  }

  if (use_module_hash_) {
    return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
                                 max_hash_table_ratio_);
  } else {
    // Account for buckets being a power of two.
    // As elements are added, file size remains constant for a while and
    // doubles its size. Since compaction algorithm stops adding elements
    // only after it exceeds the file limit, we account for the extra element
    // being added here.
    uint64_t expected_hash_table_size = hash_table_size_;
    if (expected_hash_table_size <
        (num_entries_ + 1) / max_hash_table_ratio_) {
      expected_hash_table_size *= 2;
    }
    return (key_size_ + value_size_) * expected_hash_table_size - 1;
  }
}

// This method is invoked when there is no place to insert the target key.
// It searches for a set of elements that can be moved to accommodate target
// key. The search is a BFS graph traversal with first level (hash_vals)
// being all the buckets target key could go to.
// Then, from each node (curr_node), we find all the buckets that curr_node
// could go to. They form the children of curr_node in the tree.
// We continue the traversal until we find an empty bucket, in which case, we
// move all elements along the path from first level to this empty bucket, to
// make space for target key which is inserted at first level (*bucket_id).
// If tree depth exceeds max depth, we return false indicating failure.
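// Illustrative trace (values are assumed, not from this file): suppose the
// target key hashes to buckets {4, 9} and both are full. The BFS expands the
// keys currently in buckets 4 and 9 to their alternative locations; if the
// key in bucket 9 can move to an empty bucket 17, that key is copied 9 -> 17
// and the target key takes bucket 9, whose index is returned via *bucket_id.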
bool CuckooTableBuilder::MakeSpaceForKey(
    const autovector<uint64_t>& hash_vals,
    const uint32_t make_space_for_key_call_id,
    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
  struct CuckooNode {
    uint64_t bucket_id;
    uint32_t depth;
    uint32_t parent_pos;
    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
  };
  // This is BFS search tree that is stored simply as a vector.
  // Each node stores the index of parent node in the vector.
  std::vector<CuckooNode> tree;
  // We want to identify already visited buckets in the current method call so
  // that we don't add same buckets again for exploration in the tree.
  // We do this by maintaining a count of current method call in
  // make_space_for_key_call_id, which acts as a unique id for this invocation
  // of the method. We store this number into the nodes that we explore in
  // current method call.
  // It is unlikely for the increment operation to overflow because the maximum
  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t bid = hash_vals[hash_cnt];
    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
        make_space_for_key_call_id;
    tree.push_back(CuckooNode(bid, 0, 0));
  }
  bool null_found = false;
  uint32_t curr_pos = 0;
  while (!null_found && curr_pos < tree.size()) {
    CuckooNode& curr_node = tree[curr_pos];
    uint32_t curr_depth = curr_node.depth;
    if (curr_depth >= max_search_depth_) {
      break;
    }
    CuckooBucket& curr_bucket =
        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
         ++hash_cnt) {
      uint64_t child_bucket_id = CuckooHash(
          GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      // Iterate inside Cuckoo Block.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++child_bucket_id) {
        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
                .make_space_for_key_call_id == make_space_for_key_call_id) {
          continue;
        }
        (*buckets)[static_cast<size_t>(child_bucket_id)]
            .make_space_for_key_call_id = make_space_for_key_call_id;
        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
            kMaxVectorIdx) {
          null_found = true;
          break;
        }
      }
    }
    ++curr_pos;
  }

  if (!null_found) {
    return false;
  }

  // There is an empty node in tree.back(). Now, traverse the path from this
  // empty node to top of the tree and at every node in the path, replace
  // child with the parent. Stop when first level is reached in the tree
  // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
  // this location in first level for target key to be inserted.
  uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
  while (bucket_to_replace_pos >= num_hash_func_) {
    CuckooNode& curr_node = tree[bucket_to_replace_pos];
    (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
        (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
    bucket_to_replace_pos = curr_node.parent_pos;
  }
  *bucket_id = tree[bucket_to_replace_pos].bucket_id;
  return true;
}

std::string CuckooTableBuilder::GetFileChecksum() const {
  if (file_ != nullptr) {
    return file_->GetFileChecksum();
  } else {
    return kUnknownFileChecksum;
  }
}

const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
  if (file_ != nullptr) {
    return file_->GetFileChecksumFuncName();
  } else {
    return kUnknownFileChecksumFuncName;
  }
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE