]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain/plain_table_index.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_index.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #ifndef ROCKSDB_LITE
7
8 #include <cinttypes>
9
10 #include "table/plain/plain_table_index.h"
11 #include "util/coding.h"
12 #include "util/hash.h"
13
14 namespace ROCKSDB_NAMESPACE {
15
namespace {
// Maps a 32-bit prefix hash into the range [0, num_buckets).
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}  // namespace
22
23 Status PlainTableIndex::InitFromRawData(Slice data) {
24 if (!GetVarint32(&data, &index_size_)) {
25 return Status::Corruption("Couldn't read the index size!");
26 }
27 assert(index_size_ > 0);
28 if (!GetVarint32(&data, &num_prefixes_)) {
29 return Status::Corruption("Couldn't read the index size!");
30 }
31 sub_index_size_ =
32 static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
33
34 char* index_data_begin = const_cast<char*>(data.data());
35 index_ = reinterpret_cast<uint32_t*>(index_data_begin);
36 sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
37 return Status::OK();
38 }
39
40 PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
41 uint32_t prefix_hash, uint32_t* bucket_value) const {
42 int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
43 GetUnaligned(index_ + bucket, bucket_value);
44 if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
45 *bucket_value ^= kSubIndexMask;
46 return kSubindex;
47 }
48 if (*bucket_value >= kMaxFileSize) {
49 return kNoPrefixForBucket;
50 } else {
51 // point directly to the file
52 return kDirectToFile;
53 }
54 }
55
56 void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
57 uint32_t offset) {
58 if (num_records_in_current_group_ == kNumRecordsPerGroup) {
59 current_group_ = AllocateNewGroup();
60 num_records_in_current_group_ = 0;
61 }
62 auto& new_record = current_group_[num_records_in_current_group_++];
63 new_record.hash = hash;
64 new_record.offset = offset;
65 new_record.next = nullptr;
66 }
67
68 void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
69 uint32_t key_offset) {
70 if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
71 ++num_prefixes_;
72 if (!is_first_record_) {
73 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
74 }
75 num_keys_per_prefix_ = 0;
76 prev_key_prefix_ = key_prefix_slice.ToString();
77 prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
78 due_index_ = true;
79 }
80
81 if (due_index_) {
82 // Add an index key for every kIndexIntervalForSamePrefixKeys keys
83 record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
84 due_index_ = false;
85 }
86
87 num_keys_per_prefix_++;
88 if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
89 due_index_ = true;
90 }
91 is_first_record_ = false;
92 }
93
94 Slice PlainTableIndexBuilder::Finish() {
95 AllocateIndex();
96 std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
97 std::vector<uint32_t> entries_per_bucket(index_size_, 0);
98 BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
99
100 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
101 ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
102 keys_per_prefix_hist_.ToString().c_str());
103
104 // From the temp data structure, populate indexes.
105 return FillIndexes(hash_to_offsets, entries_per_bucket);
106 }
107
108 void PlainTableIndexBuilder::AllocateIndex() {
109 if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
110 // Fall back to pure binary search if the user fails to specify a prefix
111 // extractor.
112 index_size_ = 1;
113 } else {
114 double hash_table_size_multipier = 1.0 / hash_table_ratio_;
115 index_size_ =
116 static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
117 assert(index_size_ > 0);
118 }
119 }
120
121 void PlainTableIndexBuilder::BucketizeIndexes(
122 std::vector<IndexRecord*>* hash_to_offsets,
123 std::vector<uint32_t>* entries_per_bucket) {
124 bool first = true;
125 uint32_t prev_hash = 0;
126 size_t num_records = record_list_.GetNumRecords();
127 for (size_t i = 0; i < num_records; i++) {
128 IndexRecord* index_record = record_list_.At(i);
129 uint32_t cur_hash = index_record->hash;
130 if (first || prev_hash != cur_hash) {
131 prev_hash = cur_hash;
132 first = false;
133 }
134 uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
135 IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
136 index_record->next = prev_bucket_head;
137 (*hash_to_offsets)[bucket] = index_record;
138 (*entries_per_bucket)[bucket]++;
139 }
140
141 sub_index_size_ = 0;
142 for (auto entry_count : *entries_per_bucket) {
143 if (entry_count <= 1) {
144 continue;
145 }
146 // Only buckets with more than 1 entry will have subindex.
147 sub_index_size_ += VarintLength(entry_count);
148 // total bytes needed to store these entries' in-file offsets.
149 sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
150 }
151 }
152
// Serializes the bucketized index into arena-owned memory with layout:
//   [varint32 index_size_][varint32 num_prefixes_]
//   [index_size_ unaligned fixed32 bucket entries][sub-index bytes]
// A bucket entry is one of: a direct file offset (exactly one record),
// the kMaxFileSize sentinel (empty bucket), or an offset into the
// sub-index with kSubIndexMask set (2+ records). The returned Slice
// points into arena_ memory and is valid for the arena's lifetime.
Slice PlainTableIndexBuilder::FillIndexes(
    const std::vector<IndexRecord*>& hash_to_offsets,
    const std::vector<uint32_t>& entries_per_bucket) {
  ROCKS_LOG_DEBUG(ioptions_.info_log,
                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
                  sub_index_size_);
  auto total_allocate_size = GetTotalSize();
  char* allocated = arena_->AllocateAligned(
      total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);

  // Header varints first; the bucket array starts immediately after them,
  // and the sub-index area starts after the bucket array.
  auto temp_ptr = EncodeVarint32(allocated, index_size_);
  uint32_t* index =
      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
  char* sub_index = reinterpret_cast<char*>(index + index_size_);

  // Running write position within the sub-index area.
  uint32_t sub_index_offset = 0;
  for (uint32_t i = 0; i < index_size_; i++) {
    uint32_t num_keys_for_bucket = entries_per_bucket[i];
    switch (num_keys_for_bucket) {
      case 0:
        // No key for bucket
        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
        break;
      case 1:
        // point directly to the file offset
        PutUnaligned(index + i, hash_to_offsets[i]->offset);
        break;
      default:
        // point to second level indexes.
        PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
        // Write the record count as a varint, then advance past it.
        char* prev_ptr = &sub_index[sub_index_offset];
        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
        char* sub_index_pos = &sub_index[sub_index_offset];
        IndexRecord* record = hash_to_offsets[i];
        int j;
        // The bucket chain is in reverse insertion order, so fill the slot
        // from its last position toward the first to restore the original
        // order of the records.
        for (j = num_keys_for_bucket - 1; j >= 0 && record;
             j--, record = record->next) {
          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
        }
        // Chain length must match the pre-computed per-bucket count.
        assert(j == -1 && record == nullptr);
        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
        assert(sub_index_offset <= sub_index_size_);
        break;
    }
  }
  // Every byte reserved by BucketizeIndexes must have been written.
  assert(sub_index_offset == sub_index_size_);

  ROCKS_LOG_DEBUG(ioptions_.info_log,
                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
                  index_size_, sub_index_size_);
  return Slice(allocated, GetTotalSize());
}
206
// Name of the metaindex block under which the plain table index is stored.
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
    "PlainTableIndexBlock";
209 }; // namespace ROCKSDB_NAMESPACE
210
211 #endif // ROCKSDB_LITE