// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#include "table/cuckoo_table_builder.h"

#include <assert.h>

#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/block_builder.h"
#include "table/cuckoo_table_factory.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/autovector.h"
#include "util/file_reader_writer.h"
#include "util/random.h"
#include "util/string_util.h"

namespace rocksdb {
const std::string CuckooTablePropertyNames::kEmptyKey =
    "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashFunc =
    "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kHashTableSize =
    "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength =
    "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel =
    "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
    "rocksdb.cuckoo.hash.cuckooblocksize";
const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
    "rocksdb.cuckoo.hash.identityfirst";
const std::string CuckooTablePropertyNames::kUseModuleHash =
    "rocksdb.cuckoo.hash.usemodule";
const std::string CuckooTablePropertyNames::kUserKeyLength =
    "rocksdb.cuckoo.hash.userkeylength";
// Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
CuckooTableBuilder::CuckooTableBuilder(
    WritableFileWriter* file, double max_hash_table_ratio,
    uint32_t max_num_hash_table, uint32_t max_search_depth,
    const Comparator* user_comparator, uint32_t cuckoo_block_size,
    bool use_module_hash, bool identity_as_first_hash,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
    uint32_t column_family_id, const std::string& column_family_name)
    : num_hash_func_(2),
      file_(file),
      max_hash_table_ratio_(max_hash_table_ratio),
      max_num_hash_func_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
      hash_table_size_(use_module_hash ? 0 : 2),
      is_last_level_file_(false),
      has_seen_first_key_(false),
      has_seen_first_value_(false),
      key_size_(0),
      value_size_(0),
      num_entries_(0),
      num_values_(0),
      ucomp_(user_comparator),
      use_module_hash_(use_module_hash),
      identity_as_first_hash_(identity_as_first_hash),
      get_slice_hash_(get_slice_hash),
      closed_(false) {
  // Data is in a huge block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.column_family_id = column_family_id;
  properties_.column_family_name = column_family_name;
}
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
  if (num_entries_ >= kMaxVectorIdx - 1) {
    status_ =
        Status::NotSupported("Number of keys in a file must be < 2^32-1");
    return;
  }
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    status_ = Status::Corruption("Unable to parse key into internal key.");
    return;
  }
  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
    status_ = Status::NotSupported("Unsupported key type " +
                                   ToString(ikey.type));
    return;
  }

  // Determine if we can ignore the sequence number and value type from
  // internal keys by looking at the sequence number of the first key. We
  // assume that if the first key has a zero sequence number, then all the
  // remaining keys will have zero seq. no.
  if (!has_seen_first_key_) {
    is_last_level_file_ = ikey.sequence == 0;
    has_seen_first_key_ = true;
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
  }
  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
    status_ = Status::NotSupported("all keys have to be the same size");
    return;
  }

  if (ikey.type == kTypeValue) {
    if (!has_seen_first_value_) {
      has_seen_first_value_ = true;
      value_size_ = value.size();
    }
    if (value_size_ != value.size()) {
      status_ = Status::NotSupported("all values have to be the same size");
      return;
    }

    if (is_last_level_file_) {
      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      kvs_.append(key.data(), key.size());
    }
    kvs_.append(value.data(), value.size());
    ++num_values_;
  } else {
    if (is_last_level_file_) {
      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
    } else {
      deleted_keys_.append(key.data(), key.size());
    }
  }
  ++num_entries_;

  // In order to fill the empty buckets in the hash table, we identify a
  // key which is not used so far (unused_user_key). We determine this by
  // maintaining the smallest and largest keys inserted so far in bytewise
  // order and use them to find a key outside this range in the Finish()
  // operation. Note that this strategy is independent of the user comparator
  // used here.
  if (ikey.user_key.compare(smallest_user_key_) < 0) {
    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
  }
  if (!use_module_hash_) {
    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
      hash_table_size_ *= 2;
    }
  }
}
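// Worked example (illustrative numbers; 0.9 is the usual default for
// max_hash_table_ratio_): in the non-modulo path above, hash_table_size_
// starts at 2 and doubles whenever occupancy would exceed the ratio, so it
// stays a power of two. At num_entries_ = 4 with a table of size 4, the
// check 4 < 4 / 0.9 (~4.44) fires and the table grows to 8, keeping the
// load factor under 90%.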
bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
  assert(closed_);
  return idx >= num_values_;
}

Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    return Slice(
        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
        static_cast<size_t>(key_size_));
  }
  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
               static_cast<size_t>(key_size_));
}

Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
  assert(closed_);
  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
}

Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
  assert(closed_);
  if (IsDeletedKey(idx)) {
    static std::string empty_value(static_cast<unsigned int>(value_size_),
                                   'a');
    return Slice(empty_value);
  }
  return Slice(
      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
      static_cast<size_t>(value_size_));
}
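// Layout note: kvs_ is one flat string of fixed-size records, so for a live
// entry idx the key starts at byte offset idx * (key_size_ + value_size_)
// and its value immediately follows at that offset + key_size_. For example,
// with key_size_ = 8 and value_size_ = 4, entry 3 occupies bytes [36, 48):
// key in [36, 44), value in [44, 48). Deleted keys are stored separately in
// deleted_keys_ and indexed by idx - num_values_.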
Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
  buckets->resize(
      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
  uint32_t make_space_for_key_call_id = 0;
  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
    uint64_t bucket_id = 0;
    bool bucket_found = false;
    autovector<uint64_t> hash_vals;
    Slice user_key = GetUserKey(vector_idx);
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
         ++hash_cnt) {
      uint64_t hash_val =
          CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
                     identity_as_first_hash_, get_slice_hash_);
      // If there is a collision, check the next cuckoo_block_size_ locations
      // for empty locations. While checking, if we reach the end of the hash
      // table, stop searching and proceed to the next hash function.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_id = hash_val;
          bucket_found = true;
          break;
        } else {
          if (ucomp_->Compare(
                  user_key,
                  GetUserKey(
                      (*buckets)[static_cast<size_t>(hash_val)].vector_idx)) ==
              0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
        }
      }
    }
    while (!bucket_found &&
           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
                            &bucket_id)) {
      // Rehash by increasing the number of hash functions.
      if (num_hash_func_ >= max_num_hash_func_) {
        return Status::NotSupported("Too many collisions. Unable to hash.");
      }
      // We don't really need to rehash the entire table because old hashes
      // are still valid and we only increased the number of hash functions.
      uint64_t hash_val =
          CuckooHash(user_key, num_hash_func_, use_module_hash_,
                     hash_table_size_, identity_as_first_hash_,
                     get_slice_hash_);
      ++num_hash_func_;
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
            kMaxVectorIdx) {
          bucket_found = true;
          bucket_id = hash_val;
          break;
        } else {
          hash_vals.push_back(hash_val);
        }
      }
    }
    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
  }
  return Status::OK();
}
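// Probe-sequence example: with cuckoo_block_size_ = 4 and a hash value h,
// the loops above try buckets h, h+1, h+2, h+3 for each hash function
// before falling back to MakeSpaceForKey(). The buckets vector was resized
// to hash_table_size_ + cuckoo_block_size_ - 1 above precisely so that this
// linear probe never runs past the end of the vector.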
Status CuckooTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;
  std::vector<CuckooBucket> buckets;
  Status s;
  std::string unused_bucket;
  if (num_entries_ > 0) {
    // Calculate the real hash table size if modulo hashing is enabled.
    if (use_module_hash_) {
      hash_table_size_ =
          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
    }
    s = MakeHashTable(&buckets);
    if (!s.ok()) {
      return s;
    }
    // Determine unused_user_key to fill empty buckets.
    std::string unused_user_key = smallest_user_key_;
    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
    while (curr_pos >= 0) {
      --unused_user_key[curr_pos];
      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
        break;
      }
      --curr_pos;
    }
    if (curr_pos < 0) {
      // Try using the largest key to identify an unused key.
      unused_user_key = largest_user_key_;
      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
      while (curr_pos >= 0) {
        ++unused_user_key[curr_pos];
        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
          break;
        }
        --curr_pos;
      }
    }
    if (curr_pos < 0) {
      return Status::Corruption("Unable to find unused key");
    }
    if (is_last_level_file_) {
      unused_bucket = unused_user_key;
    } else {
      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
      AppendInternalKey(&unused_bucket, ikey);
    }
  }
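  // Unused-key example: if the smallest user key seen is "bbb", decrementing
  // its last byte yields "bba", which is bytewise-smaller than every inserted
  // key and can safely fill empty buckets. The decrement walk only fails when
  // the smallest key is all 0x00 bytes; the increment walk over the largest
  // key then only fails when that key is all 0xff bytes, which is the one
  // case where the corruption status above is returned.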
  properties_.num_entries = num_entries_;
  properties_.fixed_key_len = key_size_;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kValueLength].assign(
      reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));

  uint64_t bucket_size = key_size_ + value_size_;
  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
  // Write the table.
  uint32_t num_added = 0;
  for (auto& bucket : buckets) {
    if (bucket.vector_idx == kMaxVectorIdx) {
      s = file_->Append(Slice(unused_bucket));
    } else {
      ++num_added;
      s = file_->Append(GetKey(bucket.vector_idx));
      if (s.ok()) {
        if (value_size_ > 0) {
          s = file_->Append(GetValue(bucket.vector_idx));
        }
      }
    }
    if (!s.ok()) {
      return s;
    }
  }
  assert(num_added == NumEntries());
  properties_.raw_key_size = num_added * properties_.fixed_key_len;
  properties_.raw_value_size = num_added * value_size_;
  uint64_t offset = buckets.size() * bucket_size;
  properties_.data_size = offset;
  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kNumHashFunc].assign(
      reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));

  properties_.user_collected_properties[
      CuckooTablePropertyNames::kHashTableSize].assign(
      reinterpret_cast<const char*>(&hash_table_size_),
      sizeof(hash_table_size_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kIsLastLevel].assign(
      reinterpret_cast<const char*>(&is_last_level_file_),
      sizeof(is_last_level_file_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kCuckooBlockSize].assign(
      reinterpret_cast<const char*>(&cuckoo_block_size_),
      sizeof(cuckoo_block_size_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
      reinterpret_cast<const char*>(&identity_as_first_hash_),
      sizeof(identity_as_first_hash_));
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kUseModuleHash].assign(
      reinterpret_cast<const char*>(&use_module_hash_),
      sizeof(use_module_hash_));
  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kUserKeyLength].assign(
      reinterpret_cast<const char*>(&user_key_len),
      sizeof(user_key_len));
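  // Encoding note: each fixed-width property above is stored as the raw
  // bytes of its in-memory representation (e.g. kNumHashFunc is the four
  // bytes of a uint32_t), not as a varint, so the reader side is expected
  // to copy the value back out with the matching width.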
  // Write meta blocks.
  MetaIndexBuilder meta_index_builder;
  PropertyBlockBuilder property_block_builder;

  property_block_builder.AddTableProperty(properties_);
  property_block_builder.Add(properties_.user_collected_properties);
  Slice property_block = property_block_builder.Finish();
  BlockHandle property_block_handle;
  property_block_handle.set_offset(offset);
  property_block_handle.set_size(property_block.size());
  s = file_->Append(property_block);
  offset += property_block.size();
  if (!s.ok()) {
    return s;
  }

  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
  Slice meta_index_block = meta_index_builder.Finish();

  BlockHandle meta_index_block_handle;
  meta_index_block_handle.set_offset(offset);
  meta_index_block_handle.set_size(meta_index_block.size());
  s = file_->Append(meta_index_block);
  if (!s.ok()) {
    return s;
  }

  Footer footer(kCuckooTableMagicNumber, 1);
  footer.set_metaindex_handle(meta_index_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  s = file_->Append(footer_encoding);
  return s;
}
void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}
uint64_t CuckooTableBuilder::NumEntries() const {
  return num_entries_;
}
uint64_t CuckooTableBuilder::FileSize() const {
  if (closed_) {
    return file_->GetFileSize();
  } else if (num_entries_ == 0) {
    return 0;
  }

  if (use_module_hash_) {
    return static_cast<uint64_t>((key_size_ + value_size_) *
                                 num_entries_ / max_hash_table_ratio_);
  } else {
    // Account for buckets being a power of two.
    // As elements are added, the file size remains constant for a while and
    // then doubles. Since the compaction algorithm stops adding elements
    // only after it exceeds the file limit, we account for the extra element
    // being added here.
    uint64_t expected_hash_table_size = hash_table_size_;
    if (expected_hash_table_size <
        (num_entries_ + 1) / max_hash_table_ratio_) {
      expected_hash_table_size *= 2;
    }
    return (key_size_ + value_size_) * expected_hash_table_size - 1;
  }
}
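// Worked estimate (illustrative numbers): with key_size_ = 8,
// value_size_ = 8, num_entries_ = 1000 and max_hash_table_ratio_ = 0.9,
// the modulo-hash branch predicts 16 * 1000 / 0.9, about 17,777 bytes.
// In the power-of-two branch, 1000 inserts at this ratio have already
// doubled hash_table_size_ to 2048, and since 2048 >= 1001 / 0.9 no extra
// doubling is charged, giving 16 * 2048 - 1 = 32,767 bytes.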
// This method is invoked when there is no place to insert the target key.
// It searches for a set of elements that can be moved to accommodate the
// target key. The search is a BFS graph traversal with the first level
// (hash_vals) being all the buckets the target key could go to.
// Then, from each node (curr_node), we find all the buckets that curr_node
// could go to. They form the children of curr_node in the tree.
// We continue the traversal until we find an empty bucket, in which case we
// move all elements along the path from the first level to this empty
// bucket, to make space for the target key, which is inserted at the first
// level (*bucket_id).
// If the tree depth exceeds the max depth, we return false indicating
// failure.
bool CuckooTableBuilder::MakeSpaceForKey(
    const autovector<uint64_t>& hash_vals,
    const uint32_t make_space_for_key_call_id,
    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
  struct CuckooNode {
    uint64_t bucket_id;
    uint32_t depth;
    uint32_t parent_pos;
    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
  };
  // This is the BFS search tree that is stored simply as a vector.
  // Each node stores the index of its parent node in the vector.
  std::vector<CuckooNode> tree;
  // We want to identify already visited buckets in the current method call so
  // that we don't add the same buckets again for exploration in the tree.
  // We do this by maintaining a count of the current method call in
  // make_space_for_key_call_id, which acts as a unique id for this invocation
  // of the method. We store this number into the nodes that we explore in the
  // current method call.
  // It is unlikely for the increment operation to overflow because the
  // maximum no. of times this will be called is <= max_num_hash_func_ +
  // num_entries_.
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t bid = hash_vals[hash_cnt];
    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
        make_space_for_key_call_id;
    tree.push_back(CuckooNode(bid, 0, 0));
  }
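  // Generation-counter example: if this is the 7th invocation, every bucket
  // touched here is stamped with make_space_for_key_call_id = 7; the next
  // invocation stamps with 8, so marks left by call 7 simply read as
  // "not visited" and nothing ever has to be cleared between calls.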
  bool null_found = false;
  uint32_t curr_pos = 0;
  while (!null_found && curr_pos < tree.size()) {
    CuckooNode& curr_node = tree[curr_pos];
    uint32_t curr_depth = curr_node.depth;
    if (curr_depth >= max_search_depth_) {
      break;
    }
    CuckooBucket& curr_bucket =
        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
    for (uint32_t hash_cnt = 0;
         hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
      uint64_t child_bucket_id =
          CuckooHash(GetUserKey(curr_bucket.vector_idx), hash_cnt,
                     use_module_hash_, hash_table_size_,
                     identity_as_first_hash_, get_slice_hash_);
      // Iterate inside the Cuckoo Block.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
           ++block_idx, ++child_bucket_id) {
        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
                .make_space_for_key_call_id == make_space_for_key_call_id) {
          continue;
        }
        (*buckets)[static_cast<size_t>(child_bucket_id)]
            .make_space_for_key_call_id = make_space_for_key_call_id;
        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
            kMaxVectorIdx) {
          null_found = true;
          break;
        }
      }
    }
    ++curr_pos;
  }
  if (null_found) {
    // There is an empty node in tree.back(). Now, traverse the path from
    // this empty node to the top of the tree and at every node in the path,
    // replace the child with the parent. Stop when the first level is
    // reached in the tree (happens when
    // 0 <= bucket_to_replace_pos < num_hash_func_) and return this location
    // in the first level for the target key to be inserted.
    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
    while (bucket_to_replace_pos >= num_hash_func_) {
      CuckooNode& curr_node = tree[bucket_to_replace_pos];
      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
          (*buckets)[static_cast<size_t>(
              tree[curr_node.parent_pos].bucket_id)];
      bucket_to_replace_pos = curr_node.parent_pos;
    }
    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
  }
  return null_found;
}
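// Displacement example for MakeSpaceForKey(): suppose the target key's
// candidate buckets are all full, bucket A (first level) holds key x, x can
// also hash to bucket B, and B is empty. The loop above copies A's occupant
// into B and then reports A via *bucket_id so the caller can place the
// target key there. Every displaced key moves to another of its own hash
// locations, so all existing keys remain findable.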
}  // namespace rocksdb
#endif  // ROCKSDB_LITE