]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based/index_builder.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / block_based / index_builder.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
f67539c2
TL
10#include "table/block_based/index_builder.h"
11
7c673cae 12#include <assert.h>
7c673cae 13
1e59de90 14#include <cinttypes>
7c673cae
FG
15#include <list>
16#include <string>
17
1e59de90 18#include "db/dbformat.h"
7c673cae
FG
19#include "rocksdb/comparator.h"
20#include "rocksdb/flush_block_policy.h"
f67539c2 21#include "table/block_based/partitioned_filter_block.h"
7c673cae 22#include "table/format.h"
7c673cae 23
f67539c2 24namespace ROCKSDB_NAMESPACE {
1e59de90 25
7c673cae
FG
26// Create a index builder based on its type.
27IndexBuilder* IndexBuilder::CreateIndexBuilder(
28 BlockBasedTableOptions::IndexType index_type,
29 const InternalKeyComparator* comparator,
30 const InternalKeySliceTransform* int_key_slice_transform,
11fdf7f2 31 const bool use_value_delta_encoding,
7c673cae 32 const BlockBasedTableOptions& table_opt) {
11fdf7f2 33 IndexBuilder* result = nullptr;
7c673cae
FG
34 switch (index_type) {
35 case BlockBasedTableOptions::kBinarySearch: {
11fdf7f2
TL
36 result = new ShortenedIndexBuilder(
37 comparator, table_opt.index_block_restart_interval,
f67539c2
TL
38 table_opt.format_version, use_value_delta_encoding,
39 table_opt.index_shortening, /* include_first_key */ false);
20effc67
TL
40 break;
41 }
7c673cae 42 case BlockBasedTableOptions::kHashSearch: {
f67539c2
TL
43 // Currently kHashSearch is incompatible with index_block_restart_interval
44 // > 1
45 assert(table_opt.index_block_restart_interval == 1);
46 result = new HashIndexBuilder(
47 comparator, int_key_slice_transform,
48 table_opt.index_block_restart_interval, table_opt.format_version,
49 use_value_delta_encoding, table_opt.index_shortening);
20effc67
TL
50 break;
51 }
7c673cae 52 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
11fdf7f2
TL
53 result = PartitionedIndexBuilder::CreateIndexBuilder(
54 comparator, use_value_delta_encoding, table_opt);
20effc67
TL
55 break;
56 }
f67539c2
TL
57 case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
58 result = new ShortenedIndexBuilder(
59 comparator, table_opt.index_block_restart_interval,
60 table_opt.format_version, use_value_delta_encoding,
61 table_opt.index_shortening, /* include_first_key */ true);
20effc67
TL
62 break;
63 }
7c673cae
FG
64 default: {
65 assert(!"Do not recognize the index type ");
20effc67
TL
66 break;
67 }
7c673cae 68 }
11fdf7f2 69 return result;
7c673cae
FG
70}
71
1e59de90
TL
72void ShortenedIndexBuilder::FindShortestInternalKeySeparator(
73 const Comparator& comparator, std::string* start, const Slice& limit) {
74 // Attempt to shorten the user portion of the key
75 Slice user_start = ExtractUserKey(*start);
76 Slice user_limit = ExtractUserKey(limit);
77 std::string tmp(user_start.data(), user_start.size());
78 comparator.FindShortestSeparator(&tmp, user_limit);
79 if (tmp.size() <= user_start.size() &&
80 comparator.Compare(user_start, tmp) < 0) {
81 // User key has become shorter physically, but larger logically.
82 // Tack on the earliest possible number to the shortened user key.
83 PutFixed64(&tmp,
84 PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
85 assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0);
86 assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0);
87 start->swap(tmp);
88 }
89}
90
91void ShortenedIndexBuilder::FindShortInternalKeySuccessor(
92 const Comparator& comparator, std::string* key) {
93 Slice user_key = ExtractUserKey(*key);
94 std::string tmp(user_key.data(), user_key.size());
95 comparator.FindShortSuccessor(&tmp);
96 if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) {
97 // User key has become shorter physically, but larger logically.
98 // Tack on the earliest possible number to the shortened user key.
99 PutFixed64(&tmp,
100 PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
101 assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0);
102 key->swap(tmp);
103 }
104}
105
7c673cae
FG
106PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
107 const InternalKeyComparator* comparator,
11fdf7f2 108 const bool use_value_delta_encoding,
7c673cae 109 const BlockBasedTableOptions& table_opt) {
11fdf7f2
TL
110 return new PartitionedIndexBuilder(comparator, table_opt,
111 use_value_delta_encoding);
7c673cae
FG
112}
113
114PartitionedIndexBuilder::PartitionedIndexBuilder(
115 const InternalKeyComparator* comparator,
11fdf7f2
TL
116 const BlockBasedTableOptions& table_opt,
117 const bool use_value_delta_encoding)
7c673cae 118 : IndexBuilder(comparator),
11fdf7f2
TL
119 index_block_builder_(table_opt.index_block_restart_interval,
120 true /*use_delta_encoding*/,
121 use_value_delta_encoding),
122 index_block_builder_without_seq_(table_opt.index_block_restart_interval,
123 true /*use_delta_encoding*/,
124 use_value_delta_encoding),
7c673cae 125 sub_index_builder_(nullptr),
11fdf7f2
TL
126 table_opt_(table_opt),
127 // We start by false. After each partition we revise the value based on
128 // what the sub_index_builder has decided. If the feature is disabled
129 // entirely, this will be set to true after switching the first
130 // sub_index_builder. Otherwise, it could be set to true even one of the
131 // sub_index_builders could not safely exclude seq from the keys, then it
132 // wil be enforced on all sub_index_builders on ::Finish.
133 seperator_is_key_plus_seq_(false),
134 use_value_delta_encoding_(use_value_delta_encoding) {}
7c673cae
FG
135
136PartitionedIndexBuilder::~PartitionedIndexBuilder() {
137 delete sub_index_builder_;
138}
139
140void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
141 assert(sub_index_builder_ == nullptr);
142 sub_index_builder_ = new ShortenedIndexBuilder(
11fdf7f2 143 comparator_, table_opt_.index_block_restart_interval,
f67539c2
TL
144 table_opt_.format_version, use_value_delta_encoding_,
145 table_opt_.index_shortening, /* include_first_key */ false);
20effc67
TL
146
147 // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
148 // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
149 // default on Creation) so that flush policy can point to
150 // sub_index_builder_->index_block_builder_
151 if (seperator_is_key_plus_seq_) {
152 sub_index_builder_->seperator_is_key_plus_seq_ = true;
153 }
154
7c673cae
FG
155 flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
156 table_opt_.metadata_block_size, table_opt_.block_size_deviation,
11fdf7f2
TL
157 // Note: this is sub-optimal since sub_index_builder_ could later reset
158 // seperator_is_key_plus_seq_ but the probability of that is low.
159 sub_index_builder_->seperator_is_key_plus_seq_
160 ? sub_index_builder_->index_block_builder_
161 : sub_index_builder_->index_block_builder_without_seq_));
162 partition_cut_requested_ = false;
163}
164
165void PartitionedIndexBuilder::RequestPartitionCut() {
166 partition_cut_requested_ = true;
7c673cae
FG
167}
168
169void PartitionedIndexBuilder::AddIndexEntry(
170 std::string* last_key_in_current_block,
171 const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
172 // Note: to avoid two consecuitive flush in the same method call, we do not
173 // check flush policy when adding the last key
174 if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
175 if (sub_index_builder_ == nullptr) {
176 MakeNewSubIndexBuilder();
177 }
178 sub_index_builder_->AddIndexEntry(last_key_in_current_block,
179 first_key_in_next_block, block_handle);
20effc67
TL
180 if (!seperator_is_key_plus_seq_ &&
181 sub_index_builder_->seperator_is_key_plus_seq_) {
182 // then we need to apply it to all sub-index builders and reset
183 // flush_policy to point to Block Builder of sub_index_builder_ that store
184 // internal keys.
11fdf7f2 185 seperator_is_key_plus_seq_ = true;
20effc67
TL
186 flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
187 table_opt_.metadata_block_size, table_opt_.block_size_deviation,
188 sub_index_builder_->index_block_builder_));
11fdf7f2 189 }
7c673cae
FG
190 sub_index_last_key_ = std::string(*last_key_in_current_block);
191 entries_.push_back(
192 {sub_index_last_key_,
193 std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
194 sub_index_builder_ = nullptr;
195 cut_filter_block = true;
196 } else {
197 // apply flush policy only to non-empty sub_index_builder_
198 if (sub_index_builder_ != nullptr) {
199 std::string handle_encoding;
200 block_handle.EncodeTo(&handle_encoding);
201 bool do_flush =
11fdf7f2 202 partition_cut_requested_ ||
7c673cae
FG
203 flush_policy_->Update(*last_key_in_current_block, handle_encoding);
204 if (do_flush) {
205 entries_.push_back(
206 {sub_index_last_key_,
207 std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
208 cut_filter_block = true;
209 sub_index_builder_ = nullptr;
210 }
211 }
212 if (sub_index_builder_ == nullptr) {
213 MakeNewSubIndexBuilder();
214 }
215 sub_index_builder_->AddIndexEntry(last_key_in_current_block,
216 first_key_in_next_block, block_handle);
217 sub_index_last_key_ = std::string(*last_key_in_current_block);
20effc67
TL
218 if (!seperator_is_key_plus_seq_ &&
219 sub_index_builder_->seperator_is_key_plus_seq_) {
220 // then we need to apply it to all sub-index builders and reset
221 // flush_policy to point to Block Builder of sub_index_builder_ that store
222 // internal keys.
11fdf7f2 223 seperator_is_key_plus_seq_ = true;
20effc67
TL
224 flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
225 table_opt_.metadata_block_size, table_opt_.block_size_deviation,
226 sub_index_builder_->index_block_builder_));
11fdf7f2 227 }
7c673cae
FG
228 }
229}
230
231Status PartitionedIndexBuilder::Finish(
232 IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
11fdf7f2
TL
233 if (partition_cnt_ == 0) {
234 partition_cnt_ = entries_.size();
235 }
7c673cae
FG
236 // It must be set to null after last key is added
237 assert(sub_index_builder_ == nullptr);
238 if (finishing_indexes == true) {
239 Entry& last_entry = entries_.front();
240 std::string handle_encoding;
241 last_partition_block_handle.EncodeTo(&handle_encoding);
11fdf7f2
TL
242 std::string handle_delta_encoding;
243 PutVarsignedint64(
244 &handle_delta_encoding,
245 last_partition_block_handle.size() - last_encoded_handle_.size());
246 last_encoded_handle_ = last_partition_block_handle;
247 const Slice handle_delta_encoding_slice(handle_delta_encoding);
248 index_block_builder_.Add(last_entry.key, handle_encoding,
249 &handle_delta_encoding_slice);
250 if (!seperator_is_key_plus_seq_) {
251 index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
252 handle_encoding,
253 &handle_delta_encoding_slice);
254 }
7c673cae
FG
255 entries_.pop_front();
256 }
257 // If there is no sub_index left, then return the 2nd level index.
258 if (UNLIKELY(entries_.empty())) {
11fdf7f2
TL
259 if (seperator_is_key_plus_seq_) {
260 index_blocks->index_block_contents = index_block_builder_.Finish();
261 } else {
262 index_blocks->index_block_contents =
263 index_block_builder_without_seq_.Finish();
264 }
265 top_level_index_size_ = index_blocks->index_block_contents.size();
266 index_size_ += top_level_index_size_;
7c673cae
FG
267 return Status::OK();
268 } else {
269 // Finish the next partition index in line and Incomplete() to indicate we
270 // expect more calls to Finish
271 Entry& entry = entries_.front();
11fdf7f2
TL
272 // Apply the policy to all sub-indexes
273 entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
7c673cae 274 auto s = entry.value->Finish(index_blocks);
11fdf7f2 275 index_size_ += index_blocks->index_block_contents.size();
7c673cae
FG
276 finishing_indexes = true;
277 return s.ok() ? Status::Incomplete() : s;
278 }
279}
280
11fdf7f2 281size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
f67539c2 282} // namespace ROCKSDB_NAMESPACE