]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based/index_builder.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / block_based / index_builder.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <assert.h>
f67539c2 13#include <cinttypes>
7c673cae
FG
14
15#include <list>
16#include <string>
17#include <unordered_map>
18
19#include "rocksdb/comparator.h"
f67539c2
TL
20#include "table/block_based/block_based_table_factory.h"
21#include "table/block_based/block_builder.h"
7c673cae
FG
22#include "table/format.h"
23
f67539c2 24namespace ROCKSDB_NAMESPACE {
7c673cae
FG
25// The interface for building index.
26// Instruction for adding a new concrete IndexBuilder:
27// 1. Create a subclass instantiated from IndexBuilder.
28// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
29// 3. Add a create function for the new subclass in CreateIndexBuilder.
30// Note: we can devise more advanced design to simplify the process for adding
31// new subclass, which will, on the other hand, increase the code complexity and
32// catch unwanted attention from readers. Given that we won't add/change
33// indexes frequently, it makes sense to just embrace a more straightforward
34// design that just works.
35class IndexBuilder {
36 public:
37 static IndexBuilder* CreateIndexBuilder(
38 BlockBasedTableOptions::IndexType index_type,
f67539c2 39 const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
7c673cae 40 const InternalKeySliceTransform* int_key_slice_transform,
11fdf7f2 41 const bool use_value_delta_encoding,
7c673cae
FG
42 const BlockBasedTableOptions& table_opt);
43
44 // Index builder will construct a set of blocks which contain:
45 // 1. One primary index block.
46 // 2. (Optional) a set of metablocks that contains the metadata of the
47 // primary index.
48 struct IndexBlocks {
49 Slice index_block_contents;
50 std::unordered_map<std::string, Slice> meta_blocks;
51 };
52 explicit IndexBuilder(const InternalKeyComparator* comparator)
53 : comparator_(comparator) {}
54
55 virtual ~IndexBuilder() {}
56
57 // Add a new index entry to index block.
58 // To allow further optimization, we provide `last_key_in_current_block` and
59 // `first_key_in_next_block`, based on which the specific implementation can
60 // determine the best index key to be used for the index block.
f67539c2 61 // Called before the OnKeyAdded() call for first_key_in_next_block.
7c673cae
FG
62 // @last_key_in_current_block: this parameter maybe overridden with the value
63 // "substitute key".
64 // @first_key_in_next_block: it will be nullptr if the entry being added is
65 // the last one in the table
66 //
67 // REQUIRES: Finish() has not yet been called.
68 virtual void AddIndexEntry(std::string* last_key_in_current_block,
69 const Slice* first_key_in_next_block,
70 const BlockHandle& block_handle) = 0;
71
72 // This method will be called whenever a key is added. The subclasses may
73 // override OnKeyAdded() if they need to collect additional information.
11fdf7f2 74 virtual void OnKeyAdded(const Slice& /*key*/) {}
7c673cae
FG
75
76 // Inform the index builder that all entries has been written. Block builder
77 // may therefore perform any operation required for block finalization.
78 //
79 // REQUIRES: Finish() has not yet been called.
80 inline Status Finish(IndexBlocks* index_blocks) {
81 // Throw away the changes to last_partition_block_handle. It has no effect
82 // on the first call to Finish anyway.
83 BlockHandle last_partition_block_handle;
84 return Finish(index_blocks, last_partition_block_handle);
85 }
86
87 // This override of Finish can be utilized to build the 2nd level index in
88 // PartitionIndexBuilder.
89 //
90 // index_blocks will be filled with the resulting index data. If the return
91 // value is Status::InComplete() then it means that the index is partitioned
92 // and the callee should keep calling Finish until Status::OK() is returned.
93 // In that case, last_partition_block_handle is pointer to the block written
94 // with the result of the last call to Finish. This can be utilized to build
95 // the second level index pointing to each block of partitioned indexes. The
96 // last call to Finish() that returns Status::OK() populates index_blocks with
97 // the 2nd level index content.
98 virtual Status Finish(IndexBlocks* index_blocks,
99 const BlockHandle& last_partition_block_handle) = 0;
100
11fdf7f2
TL
101 // Get the size for index block. Must be called after ::Finish.
102 virtual size_t IndexSize() const = 0;
103
104 virtual bool seperator_is_key_plus_seq() { return true; }
7c673cae
FG
105
106 protected:
107 const InternalKeyComparator* comparator_;
11fdf7f2
TL
108 // Set after ::Finish is called
109 size_t index_size_ = 0;
7c673cae
FG
110};
111
112// This index builder builds space-efficient index block.
113//
114// Optimizations:
115// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
116// search when doing index lookup (can be disabled by setting
117// index_block_restart_interval).
118// 2. Shorten the key length for index block. Other than honestly using the
119// last key in the data block as the index key, we instead find a shortest
120// substitute key that serves the same function.
121class ShortenedIndexBuilder : public IndexBuilder {
122 public:
f67539c2
TL
123 explicit ShortenedIndexBuilder(
124 const InternalKeyComparator* comparator,
125 const int index_block_restart_interval, const uint32_t format_version,
126 const bool use_value_delta_encoding,
127 BlockBasedTableOptions::IndexShorteningMode shortening_mode,
128 bool include_first_key)
7c673cae 129 : IndexBuilder(comparator),
11fdf7f2
TL
130 index_block_builder_(index_block_restart_interval,
131 true /*use_delta_encoding*/,
132 use_value_delta_encoding),
133 index_block_builder_without_seq_(index_block_restart_interval,
134 true /*use_delta_encoding*/,
f67539c2
TL
135 use_value_delta_encoding),
136 use_value_delta_encoding_(use_value_delta_encoding),
137 include_first_key_(include_first_key),
138 shortening_mode_(shortening_mode) {
11fdf7f2
TL
139 // Making the default true will disable the feature for old versions
140 seperator_is_key_plus_seq_ = (format_version <= 2);
141 }
7c673cae 142
f67539c2
TL
143 virtual void OnKeyAdded(const Slice& key) override {
144 if (include_first_key_ && current_block_first_internal_key_.empty()) {
145 current_block_first_internal_key_.assign(key.data(), key.size());
146 }
147 }
148
7c673cae
FG
149 virtual void AddIndexEntry(std::string* last_key_in_current_block,
150 const Slice* first_key_in_next_block,
151 const BlockHandle& block_handle) override {
152 if (first_key_in_next_block != nullptr) {
f67539c2
TL
153 if (shortening_mode_ !=
154 BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
155 comparator_->FindShortestSeparator(last_key_in_current_block,
156 *first_key_in_next_block);
157 }
11fdf7f2
TL
158 if (!seperator_is_key_plus_seq_ &&
159 comparator_->user_comparator()->Compare(
160 ExtractUserKey(*last_key_in_current_block),
161 ExtractUserKey(*first_key_in_next_block)) == 0) {
162 seperator_is_key_plus_seq_ = true;
163 }
7c673cae 164 } else {
f67539c2
TL
165 if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
166 kShortenSeparatorsAndSuccessor) {
167 comparator_->FindShortSuccessor(last_key_in_current_block);
168 }
7c673cae 169 }
11fdf7f2 170 auto sep = Slice(*last_key_in_current_block);
7c673cae 171
f67539c2
TL
172 assert(!include_first_key_ || !current_block_first_internal_key_.empty());
173 IndexValue entry(block_handle, current_block_first_internal_key_);
174 std::string encoded_entry;
175 std::string delta_encoded_entry;
176 entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
177 if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
178 entry.EncodeTo(&delta_encoded_entry, include_first_key_,
179 &last_encoded_handle_);
180 } else {
181 // If it's the first block, or delta encoding is disabled,
182 // BlockBuilder::Add() below won't use delta-encoded slice.
183 }
11fdf7f2 184 last_encoded_handle_ = block_handle;
f67539c2
TL
185 const Slice delta_encoded_entry_slice(delta_encoded_entry);
186 index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
11fdf7f2 187 if (!seperator_is_key_plus_seq_) {
f67539c2
TL
188 index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
189 &delta_encoded_entry_slice);
11fdf7f2 190 }
f67539c2
TL
191
192 current_block_first_internal_key_.clear();
7c673cae
FG
193 }
194
195 using IndexBuilder::Finish;
196 virtual Status Finish(
197 IndexBlocks* index_blocks,
11fdf7f2
TL
198 const BlockHandle& /*last_partition_block_handle*/) override {
199 if (seperator_is_key_plus_seq_) {
200 index_blocks->index_block_contents = index_block_builder_.Finish();
201 } else {
202 index_blocks->index_block_contents =
203 index_block_builder_without_seq_.Finish();
204 }
205 index_size_ = index_blocks->index_block_contents.size();
7c673cae
FG
206 return Status::OK();
207 }
208
11fdf7f2
TL
209 virtual size_t IndexSize() const override { return index_size_; }
210
211 virtual bool seperator_is_key_plus_seq() override {
212 return seperator_is_key_plus_seq_;
7c673cae
FG
213 }
214
215 friend class PartitionedIndexBuilder;
216
217 private:
218 BlockBuilder index_block_builder_;
11fdf7f2 219 BlockBuilder index_block_builder_without_seq_;
f67539c2 220 const bool use_value_delta_encoding_;
11fdf7f2 221 bool seperator_is_key_plus_seq_;
f67539c2
TL
222 const bool include_first_key_;
223 BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
224 BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
225 std::string current_block_first_internal_key_;
7c673cae
FG
226};
227
228// HashIndexBuilder contains a binary-searchable primary index and the
229// metadata for secondary hash index construction.
230// The metadata for hash index consists two parts:
231// - a metablock that compactly contains a sequence of prefixes. All prefixes
232// are stored consectively without any metadata (like, prefix sizes) being
233// stored, which is kept in the other metablock.
234// - a metablock contains the metadata of the prefixes, including prefix size,
235// restart index and number of block it spans. The format looks like:
236//
237// +-----------------+---------------------------+---------------------+
238// <=prefix 1
239// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
240// +-----------------+---------------------------+---------------------+
241// <=prefix 2
242// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
243// +-----------------+---------------------------+---------------------+
244// | |
245// | .... |
246// | |
247// +-----------------+---------------------------+---------------------+
248// <=prefix n
249// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
250// +-----------------+---------------------------+---------------------+
251//
252// The reason of separating these two metablocks is to enable the efficiently
253// reuse the first metablock during hash index construction without unnecessary
254// data copy or small heap allocations for prefixes.
255class HashIndexBuilder : public IndexBuilder {
256 public:
f67539c2
TL
257 explicit HashIndexBuilder(
258 const InternalKeyComparator* comparator,
259 const SliceTransform* hash_key_extractor,
260 int index_block_restart_interval, int format_version,
261 bool use_value_delta_encoding,
262 BlockBasedTableOptions::IndexShorteningMode shortening_mode)
7c673cae 263 : IndexBuilder(comparator),
11fdf7f2 264 primary_index_builder_(comparator, index_block_restart_interval,
f67539c2
TL
265 format_version, use_value_delta_encoding,
266 shortening_mode, /* include_first_key */ false),
7c673cae
FG
267 hash_key_extractor_(hash_key_extractor) {}
268
269 virtual void AddIndexEntry(std::string* last_key_in_current_block,
270 const Slice* first_key_in_next_block,
271 const BlockHandle& block_handle) override {
272 ++current_restart_index_;
273 primary_index_builder_.AddIndexEntry(last_key_in_current_block,
274 first_key_in_next_block, block_handle);
275 }
276
277 virtual void OnKeyAdded(const Slice& key) override {
278 auto key_prefix = hash_key_extractor_->Transform(key);
279 bool is_first_entry = pending_block_num_ == 0;
280
281 // Keys may share the prefix
282 if (is_first_entry || pending_entry_prefix_ != key_prefix) {
283 if (!is_first_entry) {
284 FlushPendingPrefix();
285 }
286
287 // need a hard copy otherwise the underlying data changes all the time.
288 // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
289 // copy.
290 pending_entry_prefix_ = key_prefix.ToString();
291 pending_block_num_ = 1;
292 pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
293 } else {
294 // entry number increments when keys share the prefix reside in
295 // different data blocks.
296 auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
297 assert(last_restart_index <= current_restart_index_);
298 if (last_restart_index != current_restart_index_) {
299 ++pending_block_num_;
300 }
301 }
302 }
303
304 virtual Status Finish(
305 IndexBlocks* index_blocks,
306 const BlockHandle& last_partition_block_handle) override {
11fdf7f2
TL
307 if (pending_block_num_ != 0) {
308 FlushPendingPrefix();
309 }
20effc67
TL
310 Status s = primary_index_builder_.Finish(index_blocks,
311 last_partition_block_handle);
7c673cae
FG
312 index_blocks->meta_blocks.insert(
313 {kHashIndexPrefixesBlock.c_str(), prefix_block_});
314 index_blocks->meta_blocks.insert(
315 {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
20effc67 316 return s;
7c673cae
FG
317 }
318
11fdf7f2
TL
319 virtual size_t IndexSize() const override {
320 return primary_index_builder_.IndexSize() + prefix_block_.size() +
7c673cae
FG
321 prefix_meta_block_.size();
322 }
323
11fdf7f2
TL
324 virtual bool seperator_is_key_plus_seq() override {
325 return primary_index_builder_.seperator_is_key_plus_seq();
326 }
327
7c673cae
FG
328 private:
329 void FlushPendingPrefix() {
330 prefix_block_.append(pending_entry_prefix_.data(),
331 pending_entry_prefix_.size());
332 PutVarint32Varint32Varint32(
333 &prefix_meta_block_,
334 static_cast<uint32_t>(pending_entry_prefix_.size()),
335 pending_entry_index_, pending_block_num_);
336 }
337
338 ShortenedIndexBuilder primary_index_builder_;
339 const SliceTransform* hash_key_extractor_;
340
341 // stores a sequence of prefixes
342 std::string prefix_block_;
343 // stores the metadata of prefixes
344 std::string prefix_meta_block_;
345
346 // The following 3 variables keeps unflushed prefix and its metadata.
347 // The details of block_num and entry_index can be found in
348 // "block_hash_index.{h,cc}"
349 uint32_t pending_block_num_ = 0;
350 uint32_t pending_entry_index_ = 0;
351 std::string pending_entry_prefix_;
352
353 uint64_t current_restart_index_ = 0;
354};
355
356/**
357 * IndexBuilder for two-level indexing. Internally it creates a new index for
358 * each partition and Finish then in order when Finish is called on it
359 * continiously until Status::OK() is returned.
360 *
361 * The format on the disk would be I I I I I I IP where I is block containing a
362 * partition of indexes built using ShortenedIndexBuilder and IP is a block
363 * containing a secondary index on the partitions, built using
364 * ShortenedIndexBuilder.
365 */
366class PartitionedIndexBuilder : public IndexBuilder {
367 public:
368 static PartitionedIndexBuilder* CreateIndexBuilder(
f67539c2 369 const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
11fdf7f2 370 const bool use_value_delta_encoding,
7c673cae
FG
371 const BlockBasedTableOptions& table_opt);
372
373 explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
11fdf7f2
TL
374 const BlockBasedTableOptions& table_opt,
375 const bool use_value_delta_encoding);
7c673cae
FG
376
377 virtual ~PartitionedIndexBuilder();
378
379 virtual void AddIndexEntry(std::string* last_key_in_current_block,
380 const Slice* first_key_in_next_block,
381 const BlockHandle& block_handle) override;
382
383 virtual Status Finish(
384 IndexBlocks* index_blocks,
385 const BlockHandle& last_partition_block_handle) override;
386
11fdf7f2
TL
387 virtual size_t IndexSize() const override { return index_size_; }
388 size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
389 size_t NumPartitions() const;
7c673cae
FG
390
391 inline bool ShouldCutFilterBlock() {
392 // Current policy is to align the partitions of index and filters
393 if (cut_filter_block) {
394 cut_filter_block = false;
395 return true;
396 }
397 return false;
398 }
399
400 std::string& GetPartitionKey() { return sub_index_last_key_; }
401
11fdf7f2
TL
402 // Called when an external entity (such as filter partition builder) request
403 // cutting the next partition
404 void RequestPartitionCut();
405
406 virtual bool seperator_is_key_plus_seq() override {
407 return seperator_is_key_plus_seq_;
408 }
409
410 bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
411
7c673cae 412 private:
11fdf7f2
TL
413 // Set after ::Finish is called
414 size_t top_level_index_size_ = 0;
415 // Set after ::Finish is called
416 size_t partition_cnt_ = 0;
417
7c673cae
FG
418 void MakeNewSubIndexBuilder();
419
420 struct Entry {
421 std::string key;
422 std::unique_ptr<ShortenedIndexBuilder> value;
423 };
424 std::list<Entry> entries_; // list of partitioned indexes and their keys
f67539c2 425 BlockBuilder index_block_builder_; // top-level index builder
11fdf7f2 426 BlockBuilder index_block_builder_without_seq_; // same for user keys
7c673cae
FG
427 // the active partition index builder
428 ShortenedIndexBuilder* sub_index_builder_;
429 // the last key in the active partition index builder
430 std::string sub_index_last_key_;
431 std::unique_ptr<FlushBlockPolicy> flush_policy_;
432 // true if Finish is called once but not complete yet.
433 bool finishing_indexes = false;
434 const BlockBasedTableOptions& table_opt_;
11fdf7f2
TL
435 bool seperator_is_key_plus_seq_;
436 bool use_value_delta_encoding_;
437 // true if an external entity (such as filter partition builder) request
438 // cutting the next partition
439 bool partition_cut_requested_ = true;
7c673cae
FG
440 // true if it should cut the next filter partition block
441 bool cut_filter_block = false;
11fdf7f2 442 BlockHandle last_encoded_handle_;
7c673cae 443};
f67539c2 444} // namespace ROCKSDB_NAMESPACE