1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
10 #include "table/plain/plain_table_index.h"
11 #include "util/coding.h"
12 #include "util/hash.h"
14 namespace ROCKSDB_NAMESPACE
{
17 inline uint32_t GetBucketIdFromHash(uint32_t hash
, uint32_t num_buckets
) {
18 assert(num_buckets
> 0);
19 return hash
% num_buckets
;
23 Status
PlainTableIndex::InitFromRawData(Slice data
) {
24 if (!GetVarint32(&data
, &index_size_
)) {
25 return Status::Corruption("Couldn't read the index size!");
27 assert(index_size_
> 0);
28 if (!GetVarint32(&data
, &num_prefixes_
)) {
29 return Status::Corruption("Couldn't read the index size!");
32 static_cast<uint32_t>(data
.size()) - index_size_
* kOffsetLen
;
34 char* index_data_begin
= const_cast<char*>(data
.data());
35 index_
= reinterpret_cast<uint32_t*>(index_data_begin
);
36 sub_index_
= reinterpret_cast<char*>(index_
+ index_size_
);
40 PlainTableIndex::IndexSearchResult
PlainTableIndex::GetOffset(
41 uint32_t prefix_hash
, uint32_t* bucket_value
) const {
42 int bucket
= GetBucketIdFromHash(prefix_hash
, index_size_
);
43 GetUnaligned(index_
+ bucket
, bucket_value
);
44 if ((*bucket_value
& kSubIndexMask
) == kSubIndexMask
) {
45 *bucket_value
^= kSubIndexMask
;
48 if (*bucket_value
>= kMaxFileSize
) {
49 return kNoPrefixForBucket
;
51 // point directly to the file
56 void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash
,
58 if (num_records_in_current_group_
== kNumRecordsPerGroup
) {
59 current_group_
= AllocateNewGroup();
60 num_records_in_current_group_
= 0;
62 auto& new_record
= current_group_
[num_records_in_current_group_
++];
63 new_record
.hash
= hash
;
64 new_record
.offset
= offset
;
65 new_record
.next
= nullptr;
68 void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice
,
69 uint32_t key_offset
) {
70 if (is_first_record_
|| prev_key_prefix_
!= key_prefix_slice
.ToString()) {
72 if (!is_first_record_
) {
73 keys_per_prefix_hist_
.Add(num_keys_per_prefix_
);
75 num_keys_per_prefix_
= 0;
76 prev_key_prefix_
= key_prefix_slice
.ToString();
77 prev_key_prefix_hash_
= GetSliceHash(key_prefix_slice
);
82 // Add an index key for every kIndexIntervalForSamePrefixKeys keys
83 record_list_
.AddRecord(prev_key_prefix_hash_
, key_offset
);
87 num_keys_per_prefix_
++;
88 if (index_sparseness_
== 0 || num_keys_per_prefix_
% index_sparseness_
== 0) {
91 is_first_record_
= false;
94 Slice
PlainTableIndexBuilder::Finish() {
96 std::vector
<IndexRecord
*> hash_to_offsets(index_size_
, nullptr);
97 std::vector
<uint32_t> entries_per_bucket(index_size_
, 0);
98 BucketizeIndexes(&hash_to_offsets
, &entries_per_bucket
);
100 keys_per_prefix_hist_
.Add(num_keys_per_prefix_
);
101 ROCKS_LOG_INFO(ioptions_
.info_log
, "Number of Keys per prefix Histogram: %s",
102 keys_per_prefix_hist_
.ToString().c_str());
104 // From the temp data structure, populate indexes.
105 return FillIndexes(hash_to_offsets
, entries_per_bucket
);
108 void PlainTableIndexBuilder::AllocateIndex() {
109 if (prefix_extractor_
== nullptr || hash_table_ratio_
<= 0) {
110 // Fall back to pure binary search if the user fails to specify a prefix
114 double hash_table_size_multipier
= 1.0 / hash_table_ratio_
;
116 static_cast<uint32_t>(num_prefixes_
* hash_table_size_multipier
) + 1;
117 assert(index_size_
> 0);
121 void PlainTableIndexBuilder::BucketizeIndexes(
122 std::vector
<IndexRecord
*>* hash_to_offsets
,
123 std::vector
<uint32_t>* entries_per_bucket
) {
125 uint32_t prev_hash
= 0;
126 size_t num_records
= record_list_
.GetNumRecords();
127 for (size_t i
= 0; i
< num_records
; i
++) {
128 IndexRecord
* index_record
= record_list_
.At(i
);
129 uint32_t cur_hash
= index_record
->hash
;
130 if (first
|| prev_hash
!= cur_hash
) {
131 prev_hash
= cur_hash
;
134 uint32_t bucket
= GetBucketIdFromHash(cur_hash
, index_size_
);
135 IndexRecord
* prev_bucket_head
= (*hash_to_offsets
)[bucket
];
136 index_record
->next
= prev_bucket_head
;
137 (*hash_to_offsets
)[bucket
] = index_record
;
138 (*entries_per_bucket
)[bucket
]++;
142 for (auto entry_count
: *entries_per_bucket
) {
143 if (entry_count
<= 1) {
146 // Only buckets with more than 1 entry will have subindex.
147 sub_index_size_
+= VarintLength(entry_count
);
148 // total bytes needed to store these entries' in-file offsets.
149 sub_index_size_
+= entry_count
* PlainTableIndex::kOffsetLen
;
153 Slice
PlainTableIndexBuilder::FillIndexes(
154 const std::vector
<IndexRecord
*>& hash_to_offsets
,
155 const std::vector
<uint32_t>& entries_per_bucket
) {
156 ROCKS_LOG_DEBUG(ioptions_
.info_log
,
157 "Reserving %" PRIu32
" bytes for plain table's sub_index",
159 auto total_allocate_size
= GetTotalSize();
160 char* allocated
= arena_
->AllocateAligned(
161 total_allocate_size
, huge_page_tlb_size_
, ioptions_
.info_log
);
163 auto temp_ptr
= EncodeVarint32(allocated
, index_size_
);
165 reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr
, num_prefixes_
));
166 char* sub_index
= reinterpret_cast<char*>(index
+ index_size_
);
168 uint32_t sub_index_offset
= 0;
169 for (uint32_t i
= 0; i
< index_size_
; i
++) {
170 uint32_t num_keys_for_bucket
= entries_per_bucket
[i
];
171 switch (num_keys_for_bucket
) {
174 PutUnaligned(index
+ i
, (uint32_t)PlainTableIndex::kMaxFileSize
);
177 // point directly to the file offset
178 PutUnaligned(index
+ i
, hash_to_offsets
[i
]->offset
);
181 // point to second level indexes.
182 PutUnaligned(index
+ i
, sub_index_offset
| PlainTableIndex::kSubIndexMask
);
183 char* prev_ptr
= &sub_index
[sub_index_offset
];
184 char* cur_ptr
= EncodeVarint32(prev_ptr
, num_keys_for_bucket
);
185 sub_index_offset
+= static_cast<uint32_t>(cur_ptr
- prev_ptr
);
186 char* sub_index_pos
= &sub_index
[sub_index_offset
];
187 IndexRecord
* record
= hash_to_offsets
[i
];
189 for (j
= num_keys_for_bucket
- 1; j
>= 0 && record
;
190 j
--, record
= record
->next
) {
191 EncodeFixed32(sub_index_pos
+ j
* sizeof(uint32_t), record
->offset
);
193 assert(j
== -1 && record
== nullptr);
194 sub_index_offset
+= PlainTableIndex::kOffsetLen
* num_keys_for_bucket
;
195 assert(sub_index_offset
<= sub_index_size_
);
199 assert(sub_index_offset
== sub_index_size_
);
201 ROCKS_LOG_DEBUG(ioptions_
.info_log
,
202 "hash table size: %" PRIu32
", suffix_map length %" PRIu32
,
203 index_size_
, sub_index_size_
);
204 return Slice(allocated
, GetTotalSize());
207 const std::string
PlainTableIndexBuilder::kPlainTableIndexBlock
=
208 "PlainTableIndexBlock";
209 }; // namespace ROCKSDB_NAMESPACE
211 #endif // ROCKSDB_LITE