]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | |
6 | #ifndef ROCKSDB_LITE | |
1e59de90 | 7 | #include "table/plain/plain_table_index.h" |
7c673cae | 8 | |
f67539c2 | 9 | #include <cinttypes> |
7c673cae | 10 | |
1e59de90 | 11 | #include "logging/logging.h" |
7c673cae FG |
12 | #include "util/coding.h" |
13 | #include "util/hash.h" | |
14 | ||
f67539c2 | 15 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
16 | |
17 | namespace { | |
18 | inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { | |
19 | assert(num_buckets > 0); | |
20 | return hash % num_buckets; | |
21 | } | |
1e59de90 | 22 | } // namespace |
7c673cae FG |
23 | |
24 | Status PlainTableIndex::InitFromRawData(Slice data) { | |
25 | if (!GetVarint32(&data, &index_size_)) { | |
26 | return Status::Corruption("Couldn't read the index size!"); | |
27 | } | |
28 | assert(index_size_ > 0); | |
29 | if (!GetVarint32(&data, &num_prefixes_)) { | |
30 | return Status::Corruption("Couldn't read the index size!"); | |
31 | } | |
32 | sub_index_size_ = | |
33 | static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen; | |
34 | ||
35 | char* index_data_begin = const_cast<char*>(data.data()); | |
36 | index_ = reinterpret_cast<uint32_t*>(index_data_begin); | |
37 | sub_index_ = reinterpret_cast<char*>(index_ + index_size_); | |
38 | return Status::OK(); | |
39 | } | |
40 | ||
41 | PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( | |
42 | uint32_t prefix_hash, uint32_t* bucket_value) const { | |
43 | int bucket = GetBucketIdFromHash(prefix_hash, index_size_); | |
44 | GetUnaligned(index_ + bucket, bucket_value); | |
45 | if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { | |
46 | *bucket_value ^= kSubIndexMask; | |
47 | return kSubindex; | |
48 | } | |
49 | if (*bucket_value >= kMaxFileSize) { | |
50 | return kNoPrefixForBucket; | |
51 | } else { | |
52 | // point directly to the file | |
53 | return kDirectToFile; | |
54 | } | |
55 | } | |
56 | ||
57 | void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, | |
58 | uint32_t offset) { | |
59 | if (num_records_in_current_group_ == kNumRecordsPerGroup) { | |
60 | current_group_ = AllocateNewGroup(); | |
61 | num_records_in_current_group_ = 0; | |
62 | } | |
63 | auto& new_record = current_group_[num_records_in_current_group_++]; | |
64 | new_record.hash = hash; | |
65 | new_record.offset = offset; | |
66 | new_record.next = nullptr; | |
67 | } | |
68 | ||
69 | void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, | |
70 | uint32_t key_offset) { | |
71 | if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { | |
72 | ++num_prefixes_; | |
73 | if (!is_first_record_) { | |
74 | keys_per_prefix_hist_.Add(num_keys_per_prefix_); | |
75 | } | |
76 | num_keys_per_prefix_ = 0; | |
77 | prev_key_prefix_ = key_prefix_slice.ToString(); | |
78 | prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); | |
79 | due_index_ = true; | |
80 | } | |
81 | ||
82 | if (due_index_) { | |
83 | // Add an index key for every kIndexIntervalForSamePrefixKeys keys | |
84 | record_list_.AddRecord(prev_key_prefix_hash_, key_offset); | |
85 | due_index_ = false; | |
86 | } | |
87 | ||
88 | num_keys_per_prefix_++; | |
89 | if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { | |
90 | due_index_ = true; | |
91 | } | |
92 | is_first_record_ = false; | |
93 | } | |
94 | ||
95 | Slice PlainTableIndexBuilder::Finish() { | |
96 | AllocateIndex(); | |
97 | std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr); | |
98 | std::vector<uint32_t> entries_per_bucket(index_size_, 0); | |
99 | BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); | |
100 | ||
101 | keys_per_prefix_hist_.Add(num_keys_per_prefix_); | |
1e59de90 | 102 | ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", |
7c673cae FG |
103 | keys_per_prefix_hist_.ToString().c_str()); |
104 | ||
105 | // From the temp data structure, populate indexes. | |
106 | return FillIndexes(hash_to_offsets, entries_per_bucket); | |
107 | } | |
108 | ||
109 | void PlainTableIndexBuilder::AllocateIndex() { | |
110 | if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { | |
111 | // Fall back to pure binary search if the user fails to specify a prefix | |
112 | // extractor. | |
113 | index_size_ = 1; | |
114 | } else { | |
115 | double hash_table_size_multipier = 1.0 / hash_table_ratio_; | |
116 | index_size_ = | |
1e59de90 | 117 | static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1; |
7c673cae FG |
118 | assert(index_size_ > 0); |
119 | } | |
120 | } | |
121 | ||
122 | void PlainTableIndexBuilder::BucketizeIndexes( | |
123 | std::vector<IndexRecord*>* hash_to_offsets, | |
124 | std::vector<uint32_t>* entries_per_bucket) { | |
125 | bool first = true; | |
126 | uint32_t prev_hash = 0; | |
127 | size_t num_records = record_list_.GetNumRecords(); | |
128 | for (size_t i = 0; i < num_records; i++) { | |
129 | IndexRecord* index_record = record_list_.At(i); | |
130 | uint32_t cur_hash = index_record->hash; | |
131 | if (first || prev_hash != cur_hash) { | |
132 | prev_hash = cur_hash; | |
133 | first = false; | |
134 | } | |
135 | uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); | |
136 | IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; | |
137 | index_record->next = prev_bucket_head; | |
138 | (*hash_to_offsets)[bucket] = index_record; | |
139 | (*entries_per_bucket)[bucket]++; | |
140 | } | |
141 | ||
142 | sub_index_size_ = 0; | |
143 | for (auto entry_count : *entries_per_bucket) { | |
144 | if (entry_count <= 1) { | |
145 | continue; | |
146 | } | |
147 | // Only buckets with more than 1 entry will have subindex. | |
148 | sub_index_size_ += VarintLength(entry_count); | |
149 | // total bytes needed to store these entries' in-file offsets. | |
150 | sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; | |
151 | } | |
152 | } | |
153 | ||
154 | Slice PlainTableIndexBuilder::FillIndexes( | |
155 | const std::vector<IndexRecord*>& hash_to_offsets, | |
156 | const std::vector<uint32_t>& entries_per_bucket) { | |
1e59de90 | 157 | ROCKS_LOG_DEBUG(ioptions_.logger, |
7c673cae FG |
158 | "Reserving %" PRIu32 " bytes for plain table's sub_index", |
159 | sub_index_size_); | |
160 | auto total_allocate_size = GetTotalSize(); | |
161 | char* allocated = arena_->AllocateAligned( | |
1e59de90 | 162 | total_allocate_size, huge_page_tlb_size_, ioptions_.logger); |
7c673cae FG |
163 | |
164 | auto temp_ptr = EncodeVarint32(allocated, index_size_); | |
165 | uint32_t* index = | |
166 | reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_)); | |
167 | char* sub_index = reinterpret_cast<char*>(index + index_size_); | |
168 | ||
169 | uint32_t sub_index_offset = 0; | |
170 | for (uint32_t i = 0; i < index_size_; i++) { | |
171 | uint32_t num_keys_for_bucket = entries_per_bucket[i]; | |
172 | switch (num_keys_for_bucket) { | |
173 | case 0: | |
174 | // No key for bucket | |
175 | PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize); | |
176 | break; | |
177 | case 1: | |
178 | // point directly to the file offset | |
179 | PutUnaligned(index + i, hash_to_offsets[i]->offset); | |
180 | break; | |
181 | default: | |
182 | // point to second level indexes. | |
1e59de90 TL |
183 | PutUnaligned(index + i, |
184 | sub_index_offset | PlainTableIndex::kSubIndexMask); | |
7c673cae FG |
185 | char* prev_ptr = &sub_index[sub_index_offset]; |
186 | char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); | |
187 | sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr); | |
188 | char* sub_index_pos = &sub_index[sub_index_offset]; | |
189 | IndexRecord* record = hash_to_offsets[i]; | |
190 | int j; | |
191 | for (j = num_keys_for_bucket - 1; j >= 0 && record; | |
192 | j--, record = record->next) { | |
193 | EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); | |
194 | } | |
195 | assert(j == -1 && record == nullptr); | |
196 | sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; | |
197 | assert(sub_index_offset <= sub_index_size_); | |
198 | break; | |
199 | } | |
200 | } | |
201 | assert(sub_index_offset == sub_index_size_); | |
202 | ||
1e59de90 | 203 | ROCKS_LOG_DEBUG(ioptions_.logger, |
494da23a | 204 | "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, |
7c673cae FG |
205 | index_size_, sub_index_size_); |
206 | return Slice(allocated, GetTotalSize()); | |
207 | } | |
208 | ||
209 | const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = | |
210 | "PlainTableIndexBlock"; | |
1e59de90 | 211 | } // namespace ROCKSDB_NAMESPACE |
7c673cae FG |
212 | |
213 | #endif // ROCKSDB_LITE |