]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based/index_builder.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / block_based / index_builder.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <assert.h>
7c673cae 13
1e59de90 14#include <cinttypes>
7c673cae
FG
15#include <list>
16#include <string>
17#include <unordered_map>
18
19#include "rocksdb/comparator.h"
f67539c2
TL
20#include "table/block_based/block_based_table_factory.h"
21#include "table/block_based/block_builder.h"
7c673cae
FG
22#include "table/format.h"
23
f67539c2 24namespace ROCKSDB_NAMESPACE {
7c673cae
FG
25// The interface for building index.
26// Instruction for adding a new concrete IndexBuilder:
27// 1. Create a subclass instantiated from IndexBuilder.
28// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
29// 3. Add a create function for the new subclass in CreateIndexBuilder.
30// Note: we can devise more advanced design to simplify the process for adding
31// new subclass, which will, on the other hand, increase the code complexity and
32// catch unwanted attention from readers. Given that we won't add/change
33// indexes frequently, it makes sense to just embrace a more straightforward
34// design that just works.
35class IndexBuilder {
36 public:
37 static IndexBuilder* CreateIndexBuilder(
38 BlockBasedTableOptions::IndexType index_type,
f67539c2 39 const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
7c673cae 40 const InternalKeySliceTransform* int_key_slice_transform,
11fdf7f2 41 const bool use_value_delta_encoding,
7c673cae
FG
42 const BlockBasedTableOptions& table_opt);
43
44 // Index builder will construct a set of blocks which contain:
45 // 1. One primary index block.
46 // 2. (Optional) a set of metablocks that contains the metadata of the
47 // primary index.
48 struct IndexBlocks {
49 Slice index_block_contents;
50 std::unordered_map<std::string, Slice> meta_blocks;
51 };
52 explicit IndexBuilder(const InternalKeyComparator* comparator)
53 : comparator_(comparator) {}
54
55 virtual ~IndexBuilder() {}
56
57 // Add a new index entry to index block.
58 // To allow further optimization, we provide `last_key_in_current_block` and
59 // `first_key_in_next_block`, based on which the specific implementation can
60 // determine the best index key to be used for the index block.
f67539c2 61 // Called before the OnKeyAdded() call for first_key_in_next_block.
7c673cae
FG
62 // @last_key_in_current_block: this parameter maybe overridden with the value
63 // "substitute key".
64 // @first_key_in_next_block: it will be nullptr if the entry being added is
65 // the last one in the table
66 //
67 // REQUIRES: Finish() has not yet been called.
68 virtual void AddIndexEntry(std::string* last_key_in_current_block,
69 const Slice* first_key_in_next_block,
70 const BlockHandle& block_handle) = 0;
71
72 // This method will be called whenever a key is added. The subclasses may
73 // override OnKeyAdded() if they need to collect additional information.
11fdf7f2 74 virtual void OnKeyAdded(const Slice& /*key*/) {}
7c673cae
FG
75
76 // Inform the index builder that all entries has been written. Block builder
77 // may therefore perform any operation required for block finalization.
78 //
79 // REQUIRES: Finish() has not yet been called.
80 inline Status Finish(IndexBlocks* index_blocks) {
81 // Throw away the changes to last_partition_block_handle. It has no effect
82 // on the first call to Finish anyway.
83 BlockHandle last_partition_block_handle;
84 return Finish(index_blocks, last_partition_block_handle);
85 }
86
87 // This override of Finish can be utilized to build the 2nd level index in
88 // PartitionIndexBuilder.
89 //
90 // index_blocks will be filled with the resulting index data. If the return
91 // value is Status::InComplete() then it means that the index is partitioned
92 // and the callee should keep calling Finish until Status::OK() is returned.
93 // In that case, last_partition_block_handle is pointer to the block written
94 // with the result of the last call to Finish. This can be utilized to build
95 // the second level index pointing to each block of partitioned indexes. The
96 // last call to Finish() that returns Status::OK() populates index_blocks with
97 // the 2nd level index content.
98 virtual Status Finish(IndexBlocks* index_blocks,
99 const BlockHandle& last_partition_block_handle) = 0;
100
11fdf7f2
TL
101 // Get the size for index block. Must be called after ::Finish.
102 virtual size_t IndexSize() const = 0;
103
104 virtual bool seperator_is_key_plus_seq() { return true; }
7c673cae
FG
105
106 protected:
107 const InternalKeyComparator* comparator_;
11fdf7f2
TL
108 // Set after ::Finish is called
109 size_t index_size_ = 0;
7c673cae
FG
110};
111
112// This index builder builds space-efficient index block.
113//
114// Optimizations:
115// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
116// search when doing index lookup (can be disabled by setting
117// index_block_restart_interval).
118// 2. Shorten the key length for index block. Other than honestly using the
119// last key in the data block as the index key, we instead find a shortest
120// substitute key that serves the same function.
121class ShortenedIndexBuilder : public IndexBuilder {
122 public:
f67539c2
TL
123 explicit ShortenedIndexBuilder(
124 const InternalKeyComparator* comparator,
125 const int index_block_restart_interval, const uint32_t format_version,
126 const bool use_value_delta_encoding,
127 BlockBasedTableOptions::IndexShorteningMode shortening_mode,
128 bool include_first_key)
7c673cae 129 : IndexBuilder(comparator),
11fdf7f2
TL
130 index_block_builder_(index_block_restart_interval,
131 true /*use_delta_encoding*/,
132 use_value_delta_encoding),
133 index_block_builder_without_seq_(index_block_restart_interval,
134 true /*use_delta_encoding*/,
f67539c2
TL
135 use_value_delta_encoding),
136 use_value_delta_encoding_(use_value_delta_encoding),
137 include_first_key_(include_first_key),
138 shortening_mode_(shortening_mode) {
11fdf7f2
TL
139 // Making the default true will disable the feature for old versions
140 seperator_is_key_plus_seq_ = (format_version <= 2);
141 }
7c673cae 142
f67539c2
TL
143 virtual void OnKeyAdded(const Slice& key) override {
144 if (include_first_key_ && current_block_first_internal_key_.empty()) {
145 current_block_first_internal_key_.assign(key.data(), key.size());
146 }
147 }
148
7c673cae
FG
149 virtual void AddIndexEntry(std::string* last_key_in_current_block,
150 const Slice* first_key_in_next_block,
151 const BlockHandle& block_handle) override {
152 if (first_key_in_next_block != nullptr) {
f67539c2
TL
153 if (shortening_mode_ !=
154 BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
1e59de90
TL
155 FindShortestInternalKeySeparator(*comparator_->user_comparator(),
156 last_key_in_current_block,
157 *first_key_in_next_block);
f67539c2 158 }
11fdf7f2
TL
159 if (!seperator_is_key_plus_seq_ &&
160 comparator_->user_comparator()->Compare(
161 ExtractUserKey(*last_key_in_current_block),
162 ExtractUserKey(*first_key_in_next_block)) == 0) {
163 seperator_is_key_plus_seq_ = true;
164 }
7c673cae 165 } else {
f67539c2
TL
166 if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
167 kShortenSeparatorsAndSuccessor) {
1e59de90
TL
168 FindShortInternalKeySuccessor(*comparator_->user_comparator(),
169 last_key_in_current_block);
f67539c2 170 }
7c673cae 171 }
11fdf7f2 172 auto sep = Slice(*last_key_in_current_block);
7c673cae 173
f67539c2
TL
174 assert(!include_first_key_ || !current_block_first_internal_key_.empty());
175 IndexValue entry(block_handle, current_block_first_internal_key_);
176 std::string encoded_entry;
177 std::string delta_encoded_entry;
178 entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
179 if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
180 entry.EncodeTo(&delta_encoded_entry, include_first_key_,
181 &last_encoded_handle_);
182 } else {
183 // If it's the first block, or delta encoding is disabled,
184 // BlockBuilder::Add() below won't use delta-encoded slice.
185 }
11fdf7f2 186 last_encoded_handle_ = block_handle;
f67539c2
TL
187 const Slice delta_encoded_entry_slice(delta_encoded_entry);
188 index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
11fdf7f2 189 if (!seperator_is_key_plus_seq_) {
f67539c2
TL
190 index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
191 &delta_encoded_entry_slice);
11fdf7f2 192 }
f67539c2
TL
193
194 current_block_first_internal_key_.clear();
7c673cae
FG
195 }
196
197 using IndexBuilder::Finish;
198 virtual Status Finish(
199 IndexBlocks* index_blocks,
11fdf7f2
TL
200 const BlockHandle& /*last_partition_block_handle*/) override {
201 if (seperator_is_key_plus_seq_) {
202 index_blocks->index_block_contents = index_block_builder_.Finish();
203 } else {
204 index_blocks->index_block_contents =
205 index_block_builder_without_seq_.Finish();
206 }
207 index_size_ = index_blocks->index_block_contents.size();
7c673cae
FG
208 return Status::OK();
209 }
210
11fdf7f2
TL
211 virtual size_t IndexSize() const override { return index_size_; }
212
213 virtual bool seperator_is_key_plus_seq() override {
214 return seperator_is_key_plus_seq_;
7c673cae
FG
215 }
216
1e59de90
TL
217 // Changes *key to a short string >= *key.
218 //
219 static void FindShortestInternalKeySeparator(const Comparator& comparator,
220 std::string* start,
221 const Slice& limit);
222
223 static void FindShortInternalKeySuccessor(const Comparator& comparator,
224 std::string* key);
225
7c673cae
FG
226 friend class PartitionedIndexBuilder;
227
228 private:
229 BlockBuilder index_block_builder_;
11fdf7f2 230 BlockBuilder index_block_builder_without_seq_;
f67539c2 231 const bool use_value_delta_encoding_;
11fdf7f2 232 bool seperator_is_key_plus_seq_;
f67539c2
TL
233 const bool include_first_key_;
234 BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
235 BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
236 std::string current_block_first_internal_key_;
7c673cae
FG
237};
238
239// HashIndexBuilder contains a binary-searchable primary index and the
240// metadata for secondary hash index construction.
241// The metadata for hash index consists two parts:
242// - a metablock that compactly contains a sequence of prefixes. All prefixes
243// are stored consectively without any metadata (like, prefix sizes) being
244// stored, which is kept in the other metablock.
245// - a metablock contains the metadata of the prefixes, including prefix size,
246// restart index and number of block it spans. The format looks like:
247//
248// +-----------------+---------------------------+---------------------+
249// <=prefix 1
250// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
251// +-----------------+---------------------------+---------------------+
252// <=prefix 2
253// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
254// +-----------------+---------------------------+---------------------+
255// | |
256// | .... |
257// | |
258// +-----------------+---------------------------+---------------------+
259// <=prefix n
260// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
261// +-----------------+---------------------------+---------------------+
262//
263// The reason of separating these two metablocks is to enable the efficiently
264// reuse the first metablock during hash index construction without unnecessary
265// data copy or small heap allocations for prefixes.
266class HashIndexBuilder : public IndexBuilder {
267 public:
f67539c2
TL
268 explicit HashIndexBuilder(
269 const InternalKeyComparator* comparator,
270 const SliceTransform* hash_key_extractor,
271 int index_block_restart_interval, int format_version,
272 bool use_value_delta_encoding,
273 BlockBasedTableOptions::IndexShorteningMode shortening_mode)
7c673cae 274 : IndexBuilder(comparator),
11fdf7f2 275 primary_index_builder_(comparator, index_block_restart_interval,
f67539c2
TL
276 format_version, use_value_delta_encoding,
277 shortening_mode, /* include_first_key */ false),
7c673cae
FG
278 hash_key_extractor_(hash_key_extractor) {}
279
280 virtual void AddIndexEntry(std::string* last_key_in_current_block,
281 const Slice* first_key_in_next_block,
282 const BlockHandle& block_handle) override {
283 ++current_restart_index_;
284 primary_index_builder_.AddIndexEntry(last_key_in_current_block,
285 first_key_in_next_block, block_handle);
286 }
287
288 virtual void OnKeyAdded(const Slice& key) override {
289 auto key_prefix = hash_key_extractor_->Transform(key);
290 bool is_first_entry = pending_block_num_ == 0;
291
292 // Keys may share the prefix
293 if (is_first_entry || pending_entry_prefix_ != key_prefix) {
294 if (!is_first_entry) {
295 FlushPendingPrefix();
296 }
297
298 // need a hard copy otherwise the underlying data changes all the time.
1e59de90
TL
299 // TODO(kailiu) std::to_string() is expensive. We may speed up can avoid
300 // data copy.
7c673cae
FG
301 pending_entry_prefix_ = key_prefix.ToString();
302 pending_block_num_ = 1;
303 pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
304 } else {
305 // entry number increments when keys share the prefix reside in
306 // different data blocks.
307 auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
308 assert(last_restart_index <= current_restart_index_);
309 if (last_restart_index != current_restart_index_) {
310 ++pending_block_num_;
311 }
312 }
313 }
314
315 virtual Status Finish(
316 IndexBlocks* index_blocks,
317 const BlockHandle& last_partition_block_handle) override {
11fdf7f2
TL
318 if (pending_block_num_ != 0) {
319 FlushPendingPrefix();
320 }
20effc67
TL
321 Status s = primary_index_builder_.Finish(index_blocks,
322 last_partition_block_handle);
7c673cae
FG
323 index_blocks->meta_blocks.insert(
324 {kHashIndexPrefixesBlock.c_str(), prefix_block_});
325 index_blocks->meta_blocks.insert(
326 {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
20effc67 327 return s;
7c673cae
FG
328 }
329
11fdf7f2
TL
330 virtual size_t IndexSize() const override {
331 return primary_index_builder_.IndexSize() + prefix_block_.size() +
7c673cae
FG
332 prefix_meta_block_.size();
333 }
334
11fdf7f2
TL
335 virtual bool seperator_is_key_plus_seq() override {
336 return primary_index_builder_.seperator_is_key_plus_seq();
337 }
338
7c673cae
FG
339 private:
340 void FlushPendingPrefix() {
341 prefix_block_.append(pending_entry_prefix_.data(),
342 pending_entry_prefix_.size());
343 PutVarint32Varint32Varint32(
344 &prefix_meta_block_,
345 static_cast<uint32_t>(pending_entry_prefix_.size()),
346 pending_entry_index_, pending_block_num_);
347 }
348
349 ShortenedIndexBuilder primary_index_builder_;
350 const SliceTransform* hash_key_extractor_;
351
352 // stores a sequence of prefixes
353 std::string prefix_block_;
354 // stores the metadata of prefixes
355 std::string prefix_meta_block_;
356
357 // The following 3 variables keeps unflushed prefix and its metadata.
358 // The details of block_num and entry_index can be found in
359 // "block_hash_index.{h,cc}"
360 uint32_t pending_block_num_ = 0;
361 uint32_t pending_entry_index_ = 0;
362 std::string pending_entry_prefix_;
363
364 uint64_t current_restart_index_ = 0;
365};
366
367/**
368 * IndexBuilder for two-level indexing. Internally it creates a new index for
369 * each partition and Finish then in order when Finish is called on it
370 * continiously until Status::OK() is returned.
371 *
372 * The format on the disk would be I I I I I I IP where I is block containing a
373 * partition of indexes built using ShortenedIndexBuilder and IP is a block
374 * containing a secondary index on the partitions, built using
375 * ShortenedIndexBuilder.
376 */
377class PartitionedIndexBuilder : public IndexBuilder {
378 public:
379 static PartitionedIndexBuilder* CreateIndexBuilder(
f67539c2 380 const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
11fdf7f2 381 const bool use_value_delta_encoding,
7c673cae
FG
382 const BlockBasedTableOptions& table_opt);
383
384 explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
11fdf7f2
TL
385 const BlockBasedTableOptions& table_opt,
386 const bool use_value_delta_encoding);
7c673cae
FG
387
388 virtual ~PartitionedIndexBuilder();
389
390 virtual void AddIndexEntry(std::string* last_key_in_current_block,
391 const Slice* first_key_in_next_block,
392 const BlockHandle& block_handle) override;
393
394 virtual Status Finish(
395 IndexBlocks* index_blocks,
396 const BlockHandle& last_partition_block_handle) override;
397
11fdf7f2
TL
398 virtual size_t IndexSize() const override { return index_size_; }
399 size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
400 size_t NumPartitions() const;
7c673cae
FG
401
402 inline bool ShouldCutFilterBlock() {
403 // Current policy is to align the partitions of index and filters
404 if (cut_filter_block) {
405 cut_filter_block = false;
406 return true;
407 }
408 return false;
409 }
410
411 std::string& GetPartitionKey() { return sub_index_last_key_; }
412
11fdf7f2
TL
413 // Called when an external entity (such as filter partition builder) request
414 // cutting the next partition
415 void RequestPartitionCut();
416
417 virtual bool seperator_is_key_plus_seq() override {
418 return seperator_is_key_plus_seq_;
419 }
420
421 bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
422
7c673cae 423 private:
11fdf7f2
TL
424 // Set after ::Finish is called
425 size_t top_level_index_size_ = 0;
426 // Set after ::Finish is called
427 size_t partition_cnt_ = 0;
428
7c673cae
FG
429 void MakeNewSubIndexBuilder();
430
431 struct Entry {
432 std::string key;
433 std::unique_ptr<ShortenedIndexBuilder> value;
434 };
435 std::list<Entry> entries_; // list of partitioned indexes and their keys
f67539c2 436 BlockBuilder index_block_builder_; // top-level index builder
11fdf7f2 437 BlockBuilder index_block_builder_without_seq_; // same for user keys
7c673cae
FG
438 // the active partition index builder
439 ShortenedIndexBuilder* sub_index_builder_;
440 // the last key in the active partition index builder
441 std::string sub_index_last_key_;
442 std::unique_ptr<FlushBlockPolicy> flush_policy_;
443 // true if Finish is called once but not complete yet.
444 bool finishing_indexes = false;
445 const BlockBasedTableOptions& table_opt_;
11fdf7f2
TL
446 bool seperator_is_key_plus_seq_;
447 bool use_value_delta_encoding_;
448 // true if an external entity (such as filter partition builder) request
449 // cutting the next partition
450 bool partition_cut_requested_ = true;
7c673cae
FG
451 // true if it should cut the next filter partition block
452 bool cut_filter_block = false;
11fdf7f2 453 BlockHandle last_encoded_handle_;
7c673cae 454};
f67539c2 455} // namespace ROCKSDB_NAMESPACE