]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_index.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_index.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#ifndef ROCKSDB_LITE
1e59de90 7#include "table/plain/plain_table_index.h"
7c673cae 8
f67539c2 9#include <cinttypes>
7c673cae 10
1e59de90 11#include "logging/logging.h"
7c673cae
FG
12#include "util/coding.h"
13#include "util/hash.h"
14
f67539c2 15namespace ROCKSDB_NAMESPACE {
7c673cae
FG
16
namespace {
// Maps a 32-bit prefix hash onto a bucket id in [0, bucket_count).
inline uint32_t GetBucketIdFromHash(uint32_t prefix_hash,
                                    uint32_t bucket_count) {
  assert(bucket_count > 0);
  return prefix_hash % bucket_count;
}
}  // namespace
7c673cae
FG
23
24Status PlainTableIndex::InitFromRawData(Slice data) {
25 if (!GetVarint32(&data, &index_size_)) {
26 return Status::Corruption("Couldn't read the index size!");
27 }
28 assert(index_size_ > 0);
29 if (!GetVarint32(&data, &num_prefixes_)) {
30 return Status::Corruption("Couldn't read the index size!");
31 }
32 sub_index_size_ =
33 static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
34
35 char* index_data_begin = const_cast<char*>(data.data());
36 index_ = reinterpret_cast<uint32_t*>(index_data_begin);
37 sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
38 return Status::OK();
39}
40
41PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
42 uint32_t prefix_hash, uint32_t* bucket_value) const {
43 int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
44 GetUnaligned(index_ + bucket, bucket_value);
45 if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
46 *bucket_value ^= kSubIndexMask;
47 return kSubindex;
48 }
49 if (*bucket_value >= kMaxFileSize) {
50 return kNoPrefixForBucket;
51 } else {
52 // point directly to the file
53 return kDirectToFile;
54 }
55}
56
57void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
58 uint32_t offset) {
59 if (num_records_in_current_group_ == kNumRecordsPerGroup) {
60 current_group_ = AllocateNewGroup();
61 num_records_in_current_group_ = 0;
62 }
63 auto& new_record = current_group_[num_records_in_current_group_++];
64 new_record.hash = hash;
65 new_record.offset = offset;
66 new_record.next = nullptr;
67}
68
69void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
70 uint32_t key_offset) {
71 if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
72 ++num_prefixes_;
73 if (!is_first_record_) {
74 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
75 }
76 num_keys_per_prefix_ = 0;
77 prev_key_prefix_ = key_prefix_slice.ToString();
78 prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
79 due_index_ = true;
80 }
81
82 if (due_index_) {
83 // Add an index key for every kIndexIntervalForSamePrefixKeys keys
84 record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
85 due_index_ = false;
86 }
87
88 num_keys_per_prefix_++;
89 if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
90 due_index_ = true;
91 }
92 is_first_record_ = false;
93}
94
95Slice PlainTableIndexBuilder::Finish() {
96 AllocateIndex();
97 std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
98 std::vector<uint32_t> entries_per_bucket(index_size_, 0);
99 BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
100
101 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
1e59de90 102 ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s",
7c673cae
FG
103 keys_per_prefix_hist_.ToString().c_str());
104
105 // From the temp data structure, populate indexes.
106 return FillIndexes(hash_to_offsets, entries_per_bucket);
107}
108
109void PlainTableIndexBuilder::AllocateIndex() {
110 if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
111 // Fall back to pure binary search if the user fails to specify a prefix
112 // extractor.
113 index_size_ = 1;
114 } else {
115 double hash_table_size_multipier = 1.0 / hash_table_ratio_;
116 index_size_ =
1e59de90 117 static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
7c673cae
FG
118 assert(index_size_ > 0);
119 }
120}
121
122void PlainTableIndexBuilder::BucketizeIndexes(
123 std::vector<IndexRecord*>* hash_to_offsets,
124 std::vector<uint32_t>* entries_per_bucket) {
125 bool first = true;
126 uint32_t prev_hash = 0;
127 size_t num_records = record_list_.GetNumRecords();
128 for (size_t i = 0; i < num_records; i++) {
129 IndexRecord* index_record = record_list_.At(i);
130 uint32_t cur_hash = index_record->hash;
131 if (first || prev_hash != cur_hash) {
132 prev_hash = cur_hash;
133 first = false;
134 }
135 uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
136 IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
137 index_record->next = prev_bucket_head;
138 (*hash_to_offsets)[bucket] = index_record;
139 (*entries_per_bucket)[bucket]++;
140 }
141
142 sub_index_size_ = 0;
143 for (auto entry_count : *entries_per_bucket) {
144 if (entry_count <= 1) {
145 continue;
146 }
147 // Only buckets with more than 1 entry will have subindex.
148 sub_index_size_ += VarintLength(entry_count);
149 // total bytes needed to store these entries' in-file offsets.
150 sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
151 }
152}
153
154Slice PlainTableIndexBuilder::FillIndexes(
155 const std::vector<IndexRecord*>& hash_to_offsets,
156 const std::vector<uint32_t>& entries_per_bucket) {
1e59de90 157 ROCKS_LOG_DEBUG(ioptions_.logger,
7c673cae
FG
158 "Reserving %" PRIu32 " bytes for plain table's sub_index",
159 sub_index_size_);
160 auto total_allocate_size = GetTotalSize();
161 char* allocated = arena_->AllocateAligned(
1e59de90 162 total_allocate_size, huge_page_tlb_size_, ioptions_.logger);
7c673cae
FG
163
164 auto temp_ptr = EncodeVarint32(allocated, index_size_);
165 uint32_t* index =
166 reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
167 char* sub_index = reinterpret_cast<char*>(index + index_size_);
168
169 uint32_t sub_index_offset = 0;
170 for (uint32_t i = 0; i < index_size_; i++) {
171 uint32_t num_keys_for_bucket = entries_per_bucket[i];
172 switch (num_keys_for_bucket) {
173 case 0:
174 // No key for bucket
175 PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
176 break;
177 case 1:
178 // point directly to the file offset
179 PutUnaligned(index + i, hash_to_offsets[i]->offset);
180 break;
181 default:
182 // point to second level indexes.
1e59de90
TL
183 PutUnaligned(index + i,
184 sub_index_offset | PlainTableIndex::kSubIndexMask);
7c673cae
FG
185 char* prev_ptr = &sub_index[sub_index_offset];
186 char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
187 sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
188 char* sub_index_pos = &sub_index[sub_index_offset];
189 IndexRecord* record = hash_to_offsets[i];
190 int j;
191 for (j = num_keys_for_bucket - 1; j >= 0 && record;
192 j--, record = record->next) {
193 EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
194 }
195 assert(j == -1 && record == nullptr);
196 sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
197 assert(sub_index_offset <= sub_index_size_);
198 break;
199 }
200 }
201 assert(sub_index_offset == sub_index_size_);
202
1e59de90 203 ROCKS_LOG_DEBUG(ioptions_.logger,
494da23a 204 "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
7c673cae
FG
205 index_size_, sub_index_size_);
206 return Slice(allocated, GetTotalSize());
207}
208
// Name of the metablock under which the plain table index is stored.
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
    "PlainTableIndexBlock";
1e59de90 211} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
212
213#endif // ROCKSDB_LITE