]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/index_builder.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / index_builder.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <assert.h>
13#include <inttypes.h>
14
15#include <list>
16#include <string>
17#include <unordered_map>
18
19#include "rocksdb/comparator.h"
20#include "table/block_based_table_factory.h"
21#include "table/block_builder.h"
22#include "table/format.h"
23
24namespace rocksdb {
25// The interface for building index.
26// Instruction for adding a new concrete IndexBuilder:
27// 1. Create a subclass instantiated from IndexBuilder.
28// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
29// 3. Add a create function for the new subclass in CreateIndexBuilder.
30// Note: we can devise more advanced design to simplify the process for adding
31// new subclass, which will, on the other hand, increase the code complexity and
32// catch unwanted attention from readers. Given that we won't add/change
33// indexes frequently, it makes sense to just embrace a more straightforward
34// design that just works.
35class IndexBuilder {
36 public:
37 static IndexBuilder* CreateIndexBuilder(
38 BlockBasedTableOptions::IndexType index_type,
39 const rocksdb::InternalKeyComparator* comparator,
40 const InternalKeySliceTransform* int_key_slice_transform,
11fdf7f2 41 const bool use_value_delta_encoding,
7c673cae
FG
42 const BlockBasedTableOptions& table_opt);
43
44 // Index builder will construct a set of blocks which contain:
45 // 1. One primary index block.
46 // 2. (Optional) a set of metablocks that contains the metadata of the
47 // primary index.
48 struct IndexBlocks {
49 Slice index_block_contents;
50 std::unordered_map<std::string, Slice> meta_blocks;
51 };
52 explicit IndexBuilder(const InternalKeyComparator* comparator)
53 : comparator_(comparator) {}
54
55 virtual ~IndexBuilder() {}
56
57 // Add a new index entry to index block.
58 // To allow further optimization, we provide `last_key_in_current_block` and
59 // `first_key_in_next_block`, based on which the specific implementation can
60 // determine the best index key to be used for the index block.
61 // @last_key_in_current_block: this parameter maybe overridden with the value
62 // "substitute key".
63 // @first_key_in_next_block: it will be nullptr if the entry being added is
64 // the last one in the table
65 //
66 // REQUIRES: Finish() has not yet been called.
67 virtual void AddIndexEntry(std::string* last_key_in_current_block,
68 const Slice* first_key_in_next_block,
69 const BlockHandle& block_handle) = 0;
70
71 // This method will be called whenever a key is added. The subclasses may
72 // override OnKeyAdded() if they need to collect additional information.
11fdf7f2 73 virtual void OnKeyAdded(const Slice& /*key*/) {}
7c673cae
FG
74
75 // Inform the index builder that all entries has been written. Block builder
76 // may therefore perform any operation required for block finalization.
77 //
78 // REQUIRES: Finish() has not yet been called.
79 inline Status Finish(IndexBlocks* index_blocks) {
80 // Throw away the changes to last_partition_block_handle. It has no effect
81 // on the first call to Finish anyway.
82 BlockHandle last_partition_block_handle;
83 return Finish(index_blocks, last_partition_block_handle);
84 }
85
86 // This override of Finish can be utilized to build the 2nd level index in
87 // PartitionIndexBuilder.
88 //
89 // index_blocks will be filled with the resulting index data. If the return
90 // value is Status::InComplete() then it means that the index is partitioned
91 // and the callee should keep calling Finish until Status::OK() is returned.
92 // In that case, last_partition_block_handle is pointer to the block written
93 // with the result of the last call to Finish. This can be utilized to build
94 // the second level index pointing to each block of partitioned indexes. The
95 // last call to Finish() that returns Status::OK() populates index_blocks with
96 // the 2nd level index content.
97 virtual Status Finish(IndexBlocks* index_blocks,
98 const BlockHandle& last_partition_block_handle) = 0;
99
11fdf7f2
TL
100 // Get the size for index block. Must be called after ::Finish.
101 virtual size_t IndexSize() const = 0;
102
103 virtual bool seperator_is_key_plus_seq() { return true; }
7c673cae
FG
104
105 protected:
106 const InternalKeyComparator* comparator_;
11fdf7f2
TL
107 // Set after ::Finish is called
108 size_t index_size_ = 0;
7c673cae
FG
109};
110
111// This index builder builds space-efficient index block.
112//
113// Optimizations:
114// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
115// search when doing index lookup (can be disabled by setting
116// index_block_restart_interval).
117// 2. Shorten the key length for index block. Other than honestly using the
118// last key in the data block as the index key, we instead find a shortest
119// substitute key that serves the same function.
120class ShortenedIndexBuilder : public IndexBuilder {
121 public:
122 explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
11fdf7f2
TL
123 const int index_block_restart_interval,
124 const uint32_t format_version,
125 const bool use_value_delta_encoding)
7c673cae 126 : IndexBuilder(comparator),
11fdf7f2
TL
127 index_block_builder_(index_block_restart_interval,
128 true /*use_delta_encoding*/,
129 use_value_delta_encoding),
130 index_block_builder_without_seq_(index_block_restart_interval,
131 true /*use_delta_encoding*/,
132 use_value_delta_encoding) {
133 // Making the default true will disable the feature for old versions
134 seperator_is_key_plus_seq_ = (format_version <= 2);
135 }
7c673cae
FG
136
137 virtual void AddIndexEntry(std::string* last_key_in_current_block,
138 const Slice* first_key_in_next_block,
139 const BlockHandle& block_handle) override {
140 if (first_key_in_next_block != nullptr) {
141 comparator_->FindShortestSeparator(last_key_in_current_block,
142 *first_key_in_next_block);
11fdf7f2
TL
143 if (!seperator_is_key_plus_seq_ &&
144 comparator_->user_comparator()->Compare(
145 ExtractUserKey(*last_key_in_current_block),
146 ExtractUserKey(*first_key_in_next_block)) == 0) {
147 seperator_is_key_plus_seq_ = true;
148 }
7c673cae
FG
149 } else {
150 comparator_->FindShortSuccessor(last_key_in_current_block);
151 }
11fdf7f2 152 auto sep = Slice(*last_key_in_current_block);
7c673cae
FG
153
154 std::string handle_encoding;
155 block_handle.EncodeTo(&handle_encoding);
11fdf7f2
TL
156 std::string handle_delta_encoding;
157 PutVarsignedint64(&handle_delta_encoding,
158 block_handle.size() - last_encoded_handle_.size());
159 assert(handle_delta_encoding.size() != 0);
160 last_encoded_handle_ = block_handle;
161 const Slice handle_delta_encoding_slice(handle_delta_encoding);
162 index_block_builder_.Add(sep, handle_encoding,
163 &handle_delta_encoding_slice);
164 if (!seperator_is_key_plus_seq_) {
165 index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding,
166 &handle_delta_encoding_slice);
167 }
7c673cae
FG
168 }
169
170 using IndexBuilder::Finish;
171 virtual Status Finish(
172 IndexBlocks* index_blocks,
11fdf7f2
TL
173 const BlockHandle& /*last_partition_block_handle*/) override {
174 if (seperator_is_key_plus_seq_) {
175 index_blocks->index_block_contents = index_block_builder_.Finish();
176 } else {
177 index_blocks->index_block_contents =
178 index_block_builder_without_seq_.Finish();
179 }
180 index_size_ = index_blocks->index_block_contents.size();
7c673cae
FG
181 return Status::OK();
182 }
183
11fdf7f2
TL
184 virtual size_t IndexSize() const override { return index_size_; }
185
186 virtual bool seperator_is_key_plus_seq() override {
187 return seperator_is_key_plus_seq_;
7c673cae
FG
188 }
189
190 friend class PartitionedIndexBuilder;
191
192 private:
193 BlockBuilder index_block_builder_;
11fdf7f2
TL
194 BlockBuilder index_block_builder_without_seq_;
195 bool seperator_is_key_plus_seq_;
196 BlockHandle last_encoded_handle_;
7c673cae
FG
197};
198
199// HashIndexBuilder contains a binary-searchable primary index and the
200// metadata for secondary hash index construction.
201// The metadata for hash index consists two parts:
202// - a metablock that compactly contains a sequence of prefixes. All prefixes
203// are stored consectively without any metadata (like, prefix sizes) being
204// stored, which is kept in the other metablock.
205// - a metablock contains the metadata of the prefixes, including prefix size,
206// restart index and number of block it spans. The format looks like:
207//
208// +-----------------+---------------------------+---------------------+
209// <=prefix 1
210// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
211// +-----------------+---------------------------+---------------------+
212// <=prefix 2
213// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
214// +-----------------+---------------------------+---------------------+
215// | |
216// | .... |
217// | |
218// +-----------------+---------------------------+---------------------+
219// <=prefix n
220// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
221// +-----------------+---------------------------+---------------------+
222//
223// The reason of separating these two metablocks is to enable the efficiently
224// reuse the first metablock during hash index construction without unnecessary
225// data copy or small heap allocations for prefixes.
226class HashIndexBuilder : public IndexBuilder {
227 public:
228 explicit HashIndexBuilder(const InternalKeyComparator* comparator,
229 const SliceTransform* hash_key_extractor,
11fdf7f2
TL
230 int index_block_restart_interval,
231 int format_version, bool use_value_delta_encoding)
7c673cae 232 : IndexBuilder(comparator),
11fdf7f2
TL
233 primary_index_builder_(comparator, index_block_restart_interval,
234 format_version, use_value_delta_encoding),
7c673cae
FG
235 hash_key_extractor_(hash_key_extractor) {}
236
237 virtual void AddIndexEntry(std::string* last_key_in_current_block,
238 const Slice* first_key_in_next_block,
239 const BlockHandle& block_handle) override {
240 ++current_restart_index_;
241 primary_index_builder_.AddIndexEntry(last_key_in_current_block,
242 first_key_in_next_block, block_handle);
243 }
244
245 virtual void OnKeyAdded(const Slice& key) override {
246 auto key_prefix = hash_key_extractor_->Transform(key);
247 bool is_first_entry = pending_block_num_ == 0;
248
249 // Keys may share the prefix
250 if (is_first_entry || pending_entry_prefix_ != key_prefix) {
251 if (!is_first_entry) {
252 FlushPendingPrefix();
253 }
254
255 // need a hard copy otherwise the underlying data changes all the time.
256 // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
257 // copy.
258 pending_entry_prefix_ = key_prefix.ToString();
259 pending_block_num_ = 1;
260 pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
261 } else {
262 // entry number increments when keys share the prefix reside in
263 // different data blocks.
264 auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
265 assert(last_restart_index <= current_restart_index_);
266 if (last_restart_index != current_restart_index_) {
267 ++pending_block_num_;
268 }
269 }
270 }
271
272 virtual Status Finish(
273 IndexBlocks* index_blocks,
274 const BlockHandle& last_partition_block_handle) override {
11fdf7f2
TL
275 if (pending_block_num_ != 0) {
276 FlushPendingPrefix();
277 }
7c673cae
FG
278 primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
279 index_blocks->meta_blocks.insert(
280 {kHashIndexPrefixesBlock.c_str(), prefix_block_});
281 index_blocks->meta_blocks.insert(
282 {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
283 return Status::OK();
284 }
285
11fdf7f2
TL
286 virtual size_t IndexSize() const override {
287 return primary_index_builder_.IndexSize() + prefix_block_.size() +
7c673cae
FG
288 prefix_meta_block_.size();
289 }
290
11fdf7f2
TL
291 virtual bool seperator_is_key_plus_seq() override {
292 return primary_index_builder_.seperator_is_key_plus_seq();
293 }
294
7c673cae
FG
295 private:
296 void FlushPendingPrefix() {
297 prefix_block_.append(pending_entry_prefix_.data(),
298 pending_entry_prefix_.size());
299 PutVarint32Varint32Varint32(
300 &prefix_meta_block_,
301 static_cast<uint32_t>(pending_entry_prefix_.size()),
302 pending_entry_index_, pending_block_num_);
303 }
304
305 ShortenedIndexBuilder primary_index_builder_;
306 const SliceTransform* hash_key_extractor_;
307
308 // stores a sequence of prefixes
309 std::string prefix_block_;
310 // stores the metadata of prefixes
311 std::string prefix_meta_block_;
312
313 // The following 3 variables keeps unflushed prefix and its metadata.
314 // The details of block_num and entry_index can be found in
315 // "block_hash_index.{h,cc}"
316 uint32_t pending_block_num_ = 0;
317 uint32_t pending_entry_index_ = 0;
318 std::string pending_entry_prefix_;
319
320 uint64_t current_restart_index_ = 0;
321};
322
323/**
324 * IndexBuilder for two-level indexing. Internally it creates a new index for
325 * each partition and Finish then in order when Finish is called on it
326 * continiously until Status::OK() is returned.
327 *
328 * The format on the disk would be I I I I I I IP where I is block containing a
329 * partition of indexes built using ShortenedIndexBuilder and IP is a block
330 * containing a secondary index on the partitions, built using
331 * ShortenedIndexBuilder.
332 */
333class PartitionedIndexBuilder : public IndexBuilder {
334 public:
335 static PartitionedIndexBuilder* CreateIndexBuilder(
336 const rocksdb::InternalKeyComparator* comparator,
11fdf7f2 337 const bool use_value_delta_encoding,
7c673cae
FG
338 const BlockBasedTableOptions& table_opt);
339
340 explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
11fdf7f2
TL
341 const BlockBasedTableOptions& table_opt,
342 const bool use_value_delta_encoding);
7c673cae
FG
343
344 virtual ~PartitionedIndexBuilder();
345
346 virtual void AddIndexEntry(std::string* last_key_in_current_block,
347 const Slice* first_key_in_next_block,
348 const BlockHandle& block_handle) override;
349
350 virtual Status Finish(
351 IndexBlocks* index_blocks,
352 const BlockHandle& last_partition_block_handle) override;
353
11fdf7f2
TL
354 virtual size_t IndexSize() const override { return index_size_; }
355 size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
356 size_t NumPartitions() const;
7c673cae
FG
357
358 inline bool ShouldCutFilterBlock() {
359 // Current policy is to align the partitions of index and filters
360 if (cut_filter_block) {
361 cut_filter_block = false;
362 return true;
363 }
364 return false;
365 }
366
367 std::string& GetPartitionKey() { return sub_index_last_key_; }
368
11fdf7f2
TL
369 // Called when an external entity (such as filter partition builder) request
370 // cutting the next partition
371 void RequestPartitionCut();
372
373 virtual bool seperator_is_key_plus_seq() override {
374 return seperator_is_key_plus_seq_;
375 }
376
377 bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
378
7c673cae 379 private:
11fdf7f2
TL
380 // Set after ::Finish is called
381 size_t top_level_index_size_ = 0;
382 // Set after ::Finish is called
383 size_t partition_cnt_ = 0;
384
7c673cae
FG
385 void MakeNewSubIndexBuilder();
386
387 struct Entry {
388 std::string key;
389 std::unique_ptr<ShortenedIndexBuilder> value;
390 };
391 std::list<Entry> entries_; // list of partitioned indexes and their keys
392 BlockBuilder index_block_builder_; // top-level index builder
11fdf7f2 393 BlockBuilder index_block_builder_without_seq_; // same for user keys
7c673cae
FG
394 // the active partition index builder
395 ShortenedIndexBuilder* sub_index_builder_;
396 // the last key in the active partition index builder
397 std::string sub_index_last_key_;
398 std::unique_ptr<FlushBlockPolicy> flush_policy_;
399 // true if Finish is called once but not complete yet.
400 bool finishing_indexes = false;
401 const BlockBasedTableOptions& table_opt_;
11fdf7f2
TL
402 bool seperator_is_key_plus_seq_;
403 bool use_value_delta_encoding_;
404 // true if an external entity (such as filter partition builder) request
405 // cutting the next partition
406 bool partition_cut_requested_ = true;
7c673cae
FG
407 // true if it should cut the next filter partition block
408 bool cut_filter_block = false;
11fdf7f2 409 BlockHandle last_encoded_handle_;
7c673cae
FG
410};
411} // namespace rocksdb