1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
7 #include "table/plain/plain_table_index.h"
11 #include "logging/logging.h"
12 #include "util/coding.h"
13 #include "util/hash.h"
15 namespace ROCKSDB_NAMESPACE
{
18 inline uint32_t GetBucketIdFromHash(uint32_t hash
, uint32_t num_buckets
) {
19 assert(num_buckets
> 0);
20 return hash
% num_buckets
;
24 Status
PlainTableIndex::InitFromRawData(Slice data
) {
25 if (!GetVarint32(&data
, &index_size_
)) {
26 return Status::Corruption("Couldn't read the index size!");
28 assert(index_size_
> 0);
29 if (!GetVarint32(&data
, &num_prefixes_
)) {
30 return Status::Corruption("Couldn't read the index size!");
33 static_cast<uint32_t>(data
.size()) - index_size_
* kOffsetLen
;
35 char* index_data_begin
= const_cast<char*>(data
.data());
36 index_
= reinterpret_cast<uint32_t*>(index_data_begin
);
37 sub_index_
= reinterpret_cast<char*>(index_
+ index_size_
);
41 PlainTableIndex::IndexSearchResult
PlainTableIndex::GetOffset(
42 uint32_t prefix_hash
, uint32_t* bucket_value
) const {
43 int bucket
= GetBucketIdFromHash(prefix_hash
, index_size_
);
44 GetUnaligned(index_
+ bucket
, bucket_value
);
45 if ((*bucket_value
& kSubIndexMask
) == kSubIndexMask
) {
46 *bucket_value
^= kSubIndexMask
;
49 if (*bucket_value
>= kMaxFileSize
) {
50 return kNoPrefixForBucket
;
52 // point directly to the file
57 void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash
,
59 if (num_records_in_current_group_
== kNumRecordsPerGroup
) {
60 current_group_
= AllocateNewGroup();
61 num_records_in_current_group_
= 0;
63 auto& new_record
= current_group_
[num_records_in_current_group_
++];
64 new_record
.hash
= hash
;
65 new_record
.offset
= offset
;
66 new_record
.next
= nullptr;
69 void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice
,
70 uint32_t key_offset
) {
71 if (is_first_record_
|| prev_key_prefix_
!= key_prefix_slice
.ToString()) {
73 if (!is_first_record_
) {
74 keys_per_prefix_hist_
.Add(num_keys_per_prefix_
);
76 num_keys_per_prefix_
= 0;
77 prev_key_prefix_
= key_prefix_slice
.ToString();
78 prev_key_prefix_hash_
= GetSliceHash(key_prefix_slice
);
83 // Add an index key for every kIndexIntervalForSamePrefixKeys keys
84 record_list_
.AddRecord(prev_key_prefix_hash_
, key_offset
);
88 num_keys_per_prefix_
++;
89 if (index_sparseness_
== 0 || num_keys_per_prefix_
% index_sparseness_
== 0) {
92 is_first_record_
= false;
95 Slice
PlainTableIndexBuilder::Finish() {
97 std::vector
<IndexRecord
*> hash_to_offsets(index_size_
, nullptr);
98 std::vector
<uint32_t> entries_per_bucket(index_size_
, 0);
99 BucketizeIndexes(&hash_to_offsets
, &entries_per_bucket
);
101 keys_per_prefix_hist_
.Add(num_keys_per_prefix_
);
102 ROCKS_LOG_INFO(ioptions_
.logger
, "Number of Keys per prefix Histogram: %s",
103 keys_per_prefix_hist_
.ToString().c_str());
105 // From the temp data structure, populate indexes.
106 return FillIndexes(hash_to_offsets
, entries_per_bucket
);
109 void PlainTableIndexBuilder::AllocateIndex() {
110 if (prefix_extractor_
== nullptr || hash_table_ratio_
<= 0) {
111 // Fall back to pure binary search if the user fails to specify a prefix
115 double hash_table_size_multipier
= 1.0 / hash_table_ratio_
;
117 static_cast<uint32_t>(num_prefixes_
* hash_table_size_multipier
) + 1;
118 assert(index_size_
> 0);
122 void PlainTableIndexBuilder::BucketizeIndexes(
123 std::vector
<IndexRecord
*>* hash_to_offsets
,
124 std::vector
<uint32_t>* entries_per_bucket
) {
126 uint32_t prev_hash
= 0;
127 size_t num_records
= record_list_
.GetNumRecords();
128 for (size_t i
= 0; i
< num_records
; i
++) {
129 IndexRecord
* index_record
= record_list_
.At(i
);
130 uint32_t cur_hash
= index_record
->hash
;
131 if (first
|| prev_hash
!= cur_hash
) {
132 prev_hash
= cur_hash
;
135 uint32_t bucket
= GetBucketIdFromHash(cur_hash
, index_size_
);
136 IndexRecord
* prev_bucket_head
= (*hash_to_offsets
)[bucket
];
137 index_record
->next
= prev_bucket_head
;
138 (*hash_to_offsets
)[bucket
] = index_record
;
139 (*entries_per_bucket
)[bucket
]++;
143 for (auto entry_count
: *entries_per_bucket
) {
144 if (entry_count
<= 1) {
147 // Only buckets with more than 1 entry will have subindex.
148 sub_index_size_
+= VarintLength(entry_count
);
149 // total bytes needed to store these entries' in-file offsets.
150 sub_index_size_
+= entry_count
* PlainTableIndex::kOffsetLen
;
154 Slice
PlainTableIndexBuilder::FillIndexes(
155 const std::vector
<IndexRecord
*>& hash_to_offsets
,
156 const std::vector
<uint32_t>& entries_per_bucket
) {
157 ROCKS_LOG_DEBUG(ioptions_
.logger
,
158 "Reserving %" PRIu32
" bytes for plain table's sub_index",
160 auto total_allocate_size
= GetTotalSize();
161 char* allocated
= arena_
->AllocateAligned(
162 total_allocate_size
, huge_page_tlb_size_
, ioptions_
.logger
);
164 auto temp_ptr
= EncodeVarint32(allocated
, index_size_
);
166 reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr
, num_prefixes_
));
167 char* sub_index
= reinterpret_cast<char*>(index
+ index_size_
);
169 uint32_t sub_index_offset
= 0;
170 for (uint32_t i
= 0; i
< index_size_
; i
++) {
171 uint32_t num_keys_for_bucket
= entries_per_bucket
[i
];
172 switch (num_keys_for_bucket
) {
175 PutUnaligned(index
+ i
, (uint32_t)PlainTableIndex::kMaxFileSize
);
178 // point directly to the file offset
179 PutUnaligned(index
+ i
, hash_to_offsets
[i
]->offset
);
182 // point to second level indexes.
183 PutUnaligned(index
+ i
,
184 sub_index_offset
| PlainTableIndex::kSubIndexMask
);
185 char* prev_ptr
= &sub_index
[sub_index_offset
];
186 char* cur_ptr
= EncodeVarint32(prev_ptr
, num_keys_for_bucket
);
187 sub_index_offset
+= static_cast<uint32_t>(cur_ptr
- prev_ptr
);
188 char* sub_index_pos
= &sub_index
[sub_index_offset
];
189 IndexRecord
* record
= hash_to_offsets
[i
];
191 for (j
= num_keys_for_bucket
- 1; j
>= 0 && record
;
192 j
--, record
= record
->next
) {
193 EncodeFixed32(sub_index_pos
+ j
* sizeof(uint32_t), record
->offset
);
195 assert(j
== -1 && record
== nullptr);
196 sub_index_offset
+= PlainTableIndex::kOffsetLen
* num_keys_for_bucket
;
197 assert(sub_index_offset
<= sub_index_size_
);
201 assert(sub_index_offset
== sub_index_size_
);
203 ROCKS_LOG_DEBUG(ioptions_
.logger
,
204 "hash table size: %" PRIu32
", suffix_map length %" PRIu32
,
205 index_size_
, sub_index_size_
);
206 return Slice(allocated
, GetTotalSize());
209 const std::string
PlainTableIndexBuilder::kPlainTableIndexBlock
=
210 "PlainTableIndexBlock";
211 } // namespace ROCKSDB_NAMESPACE
213 #endif // ROCKSDB_LITE