]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain_table_index.cc
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / plain_table_index.cc
CommitLineData
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>

#include "table/plain_table_index.h"
#include "util/coding.h"
#include "util/hash.h"

namespace rocksdb {
namespace {
// Maps a prefix hash onto one of the `num_buckets` index buckets.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}  // namespace
26
27Status PlainTableIndex::InitFromRawData(Slice data) {
28 if (!GetVarint32(&data, &index_size_)) {
29 return Status::Corruption("Couldn't read the index size!");
30 }
31 assert(index_size_ > 0);
32 if (!GetVarint32(&data, &num_prefixes_)) {
33 return Status::Corruption("Couldn't read the index size!");
34 }
35 sub_index_size_ =
36 static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
37
38 char* index_data_begin = const_cast<char*>(data.data());
39 index_ = reinterpret_cast<uint32_t*>(index_data_begin);
40 sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
41 return Status::OK();
42}
43
44PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
45 uint32_t prefix_hash, uint32_t* bucket_value) const {
46 int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
47 GetUnaligned(index_ + bucket, bucket_value);
48 if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
49 *bucket_value ^= kSubIndexMask;
50 return kSubindex;
51 }
52 if (*bucket_value >= kMaxFileSize) {
53 return kNoPrefixForBucket;
54 } else {
55 // point directly to the file
56 return kDirectToFile;
57 }
58}
59
60void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
61 uint32_t offset) {
62 if (num_records_in_current_group_ == kNumRecordsPerGroup) {
63 current_group_ = AllocateNewGroup();
64 num_records_in_current_group_ = 0;
65 }
66 auto& new_record = current_group_[num_records_in_current_group_++];
67 new_record.hash = hash;
68 new_record.offset = offset;
69 new_record.next = nullptr;
70}
71
72void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
73 uint32_t key_offset) {
74 if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
75 ++num_prefixes_;
76 if (!is_first_record_) {
77 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
78 }
79 num_keys_per_prefix_ = 0;
80 prev_key_prefix_ = key_prefix_slice.ToString();
81 prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
82 due_index_ = true;
83 }
84
85 if (due_index_) {
86 // Add an index key for every kIndexIntervalForSamePrefixKeys keys
87 record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
88 due_index_ = false;
89 }
90
91 num_keys_per_prefix_++;
92 if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
93 due_index_ = true;
94 }
95 is_first_record_ = false;
96}
97
98Slice PlainTableIndexBuilder::Finish() {
99 AllocateIndex();
100 std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
101 std::vector<uint32_t> entries_per_bucket(index_size_, 0);
102 BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
103
104 keys_per_prefix_hist_.Add(num_keys_per_prefix_);
105 ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
106 keys_per_prefix_hist_.ToString().c_str());
107
108 // From the temp data structure, populate indexes.
109 return FillIndexes(hash_to_offsets, entries_per_bucket);
110}
111
112void PlainTableIndexBuilder::AllocateIndex() {
113 if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
114 // Fall back to pure binary search if the user fails to specify a prefix
115 // extractor.
116 index_size_ = 1;
117 } else {
118 double hash_table_size_multipier = 1.0 / hash_table_ratio_;
119 index_size_ =
120 static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
121 assert(index_size_ > 0);
122 }
123}
124
125void PlainTableIndexBuilder::BucketizeIndexes(
126 std::vector<IndexRecord*>* hash_to_offsets,
127 std::vector<uint32_t>* entries_per_bucket) {
128 bool first = true;
129 uint32_t prev_hash = 0;
130 size_t num_records = record_list_.GetNumRecords();
131 for (size_t i = 0; i < num_records; i++) {
132 IndexRecord* index_record = record_list_.At(i);
133 uint32_t cur_hash = index_record->hash;
134 if (first || prev_hash != cur_hash) {
135 prev_hash = cur_hash;
136 first = false;
137 }
138 uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
139 IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
140 index_record->next = prev_bucket_head;
141 (*hash_to_offsets)[bucket] = index_record;
142 (*entries_per_bucket)[bucket]++;
143 }
144
145 sub_index_size_ = 0;
146 for (auto entry_count : *entries_per_bucket) {
147 if (entry_count <= 1) {
148 continue;
149 }
150 // Only buckets with more than 1 entry will have subindex.
151 sub_index_size_ += VarintLength(entry_count);
152 // total bytes needed to store these entries' in-file offsets.
153 sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
154 }
155}
156
157Slice PlainTableIndexBuilder::FillIndexes(
158 const std::vector<IndexRecord*>& hash_to_offsets,
159 const std::vector<uint32_t>& entries_per_bucket) {
160 ROCKS_LOG_DEBUG(ioptions_.info_log,
161 "Reserving %" PRIu32 " bytes for plain table's sub_index",
162 sub_index_size_);
163 auto total_allocate_size = GetTotalSize();
164 char* allocated = arena_->AllocateAligned(
165 total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
166
167 auto temp_ptr = EncodeVarint32(allocated, index_size_);
168 uint32_t* index =
169 reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
170 char* sub_index = reinterpret_cast<char*>(index + index_size_);
171
172 uint32_t sub_index_offset = 0;
173 for (uint32_t i = 0; i < index_size_; i++) {
174 uint32_t num_keys_for_bucket = entries_per_bucket[i];
175 switch (num_keys_for_bucket) {
176 case 0:
177 // No key for bucket
178 PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
179 break;
180 case 1:
181 // point directly to the file offset
182 PutUnaligned(index + i, hash_to_offsets[i]->offset);
183 break;
184 default:
185 // point to second level indexes.
186 PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
187 char* prev_ptr = &sub_index[sub_index_offset];
188 char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
189 sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
190 char* sub_index_pos = &sub_index[sub_index_offset];
191 IndexRecord* record = hash_to_offsets[i];
192 int j;
193 for (j = num_keys_for_bucket - 1; j >= 0 && record;
194 j--, record = record->next) {
195 EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
196 }
197 assert(j == -1 && record == nullptr);
198 sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
199 assert(sub_index_offset <= sub_index_size_);
200 break;
201 }
202 }
203 assert(sub_index_offset == sub_index_size_);
204
205 ROCKS_LOG_DEBUG(ioptions_.info_log,
494da23a 206 "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
7c673cae
FG
207 index_size_, sub_index_size_);
208 return Slice(allocated, GetTotalSize());
209}
210
211const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
212 "PlainTableIndexBlock";
213}; // namespace rocksdb
214
215#endif // ROCKSDB_LITE