1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
17 #include <unordered_map>
19 #include "rocksdb/comparator.h"
20 #include "table/block_based_table_factory.h"
21 #include "table/block_builder.h"
22 #include "table/format.h"
25 // The interface for building index.
26 // Instruction for adding a new concrete IndexBuilder:
27 // 1. Create a subclass instantiated from IndexBuilder.
28 // 2. Add a new entry associated with that subclass in TableOptions::IndexType.
29 // 3. Add a create function for the new subclass in CreateIndexBuilder.
30 // Note: we can devise more advanced design to simplify the process for adding
31 // new subclass, which will, on the other hand, increase the code complexity and
32 // catch unwanted attention from readers. Given that we won't add/change
33 // indexes frequently, it makes sense to just embrace a more straightforward
34 // design that just works.
37 static IndexBuilder
* CreateIndexBuilder(
38 BlockBasedTableOptions::IndexType index_type
,
39 const rocksdb::InternalKeyComparator
* comparator
,
40 const InternalKeySliceTransform
* int_key_slice_transform
,
41 const bool use_value_delta_encoding
,
42 const BlockBasedTableOptions
& table_opt
);
44 // Index builder will construct a set of blocks which contain:
45 // 1. One primary index block.
46 // 2. (Optional) a set of metablocks that contains the metadata of the
49 Slice index_block_contents
;
50 std::unordered_map
<std::string
, Slice
> meta_blocks
;
52 explicit IndexBuilder(const InternalKeyComparator
* comparator
)
53 : comparator_(comparator
) {}
55 virtual ~IndexBuilder() {}
57 // Add a new index entry to index block.
58 // To allow further optimization, we provide `last_key_in_current_block` and
59 // `first_key_in_next_block`, based on which the specific implementation can
60 // determine the best index key to be used for the index block.
61 // @last_key_in_current_block: this parameter maybe overridden with the value
63 // @first_key_in_next_block: it will be nullptr if the entry being added is
64 // the last one in the table
66 // REQUIRES: Finish() has not yet been called.
67 virtual void AddIndexEntry(std::string
* last_key_in_current_block
,
68 const Slice
* first_key_in_next_block
,
69 const BlockHandle
& block_handle
) = 0;
71 // This method will be called whenever a key is added. The subclasses may
72 // override OnKeyAdded() if they need to collect additional information.
73 virtual void OnKeyAdded(const Slice
& /*key*/) {}
75 // Inform the index builder that all entries has been written. Block builder
76 // may therefore perform any operation required for block finalization.
78 // REQUIRES: Finish() has not yet been called.
79 inline Status
Finish(IndexBlocks
* index_blocks
) {
80 // Throw away the changes to last_partition_block_handle. It has no effect
81 // on the first call to Finish anyway.
82 BlockHandle last_partition_block_handle
;
83 return Finish(index_blocks
, last_partition_block_handle
);
86 // This override of Finish can be utilized to build the 2nd level index in
87 // PartitionIndexBuilder.
89 // index_blocks will be filled with the resulting index data. If the return
90 // value is Status::InComplete() then it means that the index is partitioned
91 // and the callee should keep calling Finish until Status::OK() is returned.
92 // In that case, last_partition_block_handle is pointer to the block written
93 // with the result of the last call to Finish. This can be utilized to build
94 // the second level index pointing to each block of partitioned indexes. The
95 // last call to Finish() that returns Status::OK() populates index_blocks with
96 // the 2nd level index content.
97 virtual Status
Finish(IndexBlocks
* index_blocks
,
98 const BlockHandle
& last_partition_block_handle
) = 0;
100 // Get the size for index block. Must be called after ::Finish.
101 virtual size_t IndexSize() const = 0;
103 virtual bool seperator_is_key_plus_seq() { return true; }
106 const InternalKeyComparator
* comparator_
;
107 // Set after ::Finish is called
108 size_t index_size_
= 0;
111 // This index builder builds space-efficient index block.
114 // 1. Made block's `block_restart_interval` to be 1, which will avoid linear
115 // search when doing index lookup (can be disabled by setting
116 // index_block_restart_interval).
117 // 2. Shorten the key length for index block. Other than honestly using the
118 // last key in the data block as the index key, we instead find a shortest
119 // substitute key that serves the same function.
120 class ShortenedIndexBuilder
: public IndexBuilder
{
122 explicit ShortenedIndexBuilder(const InternalKeyComparator
* comparator
,
123 const int index_block_restart_interval
,
124 const uint32_t format_version
,
125 const bool use_value_delta_encoding
)
126 : IndexBuilder(comparator
),
127 index_block_builder_(index_block_restart_interval
,
128 true /*use_delta_encoding*/,
129 use_value_delta_encoding
),
130 index_block_builder_without_seq_(index_block_restart_interval
,
131 true /*use_delta_encoding*/,
132 use_value_delta_encoding
) {
133 // Making the default true will disable the feature for old versions
134 seperator_is_key_plus_seq_
= (format_version
<= 2);
137 virtual void AddIndexEntry(std::string
* last_key_in_current_block
,
138 const Slice
* first_key_in_next_block
,
139 const BlockHandle
& block_handle
) override
{
140 if (first_key_in_next_block
!= nullptr) {
141 comparator_
->FindShortestSeparator(last_key_in_current_block
,
142 *first_key_in_next_block
);
143 if (!seperator_is_key_plus_seq_
&&
144 comparator_
->user_comparator()->Compare(
145 ExtractUserKey(*last_key_in_current_block
),
146 ExtractUserKey(*first_key_in_next_block
)) == 0) {
147 seperator_is_key_plus_seq_
= true;
150 comparator_
->FindShortSuccessor(last_key_in_current_block
);
152 auto sep
= Slice(*last_key_in_current_block
);
154 std::string handle_encoding
;
155 block_handle
.EncodeTo(&handle_encoding
);
156 std::string handle_delta_encoding
;
157 PutVarsignedint64(&handle_delta_encoding
,
158 block_handle
.size() - last_encoded_handle_
.size());
159 assert(handle_delta_encoding
.size() != 0);
160 last_encoded_handle_
= block_handle
;
161 const Slice
handle_delta_encoding_slice(handle_delta_encoding
);
162 index_block_builder_
.Add(sep
, handle_encoding
,
163 &handle_delta_encoding_slice
);
164 if (!seperator_is_key_plus_seq_
) {
165 index_block_builder_without_seq_
.Add(ExtractUserKey(sep
), handle_encoding
,
166 &handle_delta_encoding_slice
);
170 using IndexBuilder::Finish
;
171 virtual Status
Finish(
172 IndexBlocks
* index_blocks
,
173 const BlockHandle
& /*last_partition_block_handle*/) override
{
174 if (seperator_is_key_plus_seq_
) {
175 index_blocks
->index_block_contents
= index_block_builder_
.Finish();
177 index_blocks
->index_block_contents
=
178 index_block_builder_without_seq_
.Finish();
180 index_size_
= index_blocks
->index_block_contents
.size();
184 virtual size_t IndexSize() const override
{ return index_size_
; }
186 virtual bool seperator_is_key_plus_seq() override
{
187 return seperator_is_key_plus_seq_
;
190 friend class PartitionedIndexBuilder
;
193 BlockBuilder index_block_builder_
;
194 BlockBuilder index_block_builder_without_seq_
;
195 bool seperator_is_key_plus_seq_
;
196 BlockHandle last_encoded_handle_
;
199 // HashIndexBuilder contains a binary-searchable primary index and the
200 // metadata for secondary hash index construction.
201 // The metadata for hash index consists two parts:
202 // - a metablock that compactly contains a sequence of prefixes. All prefixes
203 // are stored consectively without any metadata (like, prefix sizes) being
204 // stored, which is kept in the other metablock.
205 // - a metablock contains the metadata of the prefixes, including prefix size,
206 // restart index and number of block it spans. The format looks like:
208 // +-----------------+---------------------------+---------------------+
210 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
211 // +-----------------+---------------------------+---------------------+
213 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
214 // +-----------------+---------------------------+---------------------+
218 // +-----------------+---------------------------+---------------------+
220 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
221 // +-----------------+---------------------------+---------------------+
223 // The reason of separating these two metablocks is to enable the efficiently
224 // reuse the first metablock during hash index construction without unnecessary
225 // data copy or small heap allocations for prefixes.
226 class HashIndexBuilder
: public IndexBuilder
{
228 explicit HashIndexBuilder(const InternalKeyComparator
* comparator
,
229 const SliceTransform
* hash_key_extractor
,
230 int index_block_restart_interval
,
231 int format_version
, bool use_value_delta_encoding
)
232 : IndexBuilder(comparator
),
233 primary_index_builder_(comparator
, index_block_restart_interval
,
234 format_version
, use_value_delta_encoding
),
235 hash_key_extractor_(hash_key_extractor
) {}
237 virtual void AddIndexEntry(std::string
* last_key_in_current_block
,
238 const Slice
* first_key_in_next_block
,
239 const BlockHandle
& block_handle
) override
{
240 ++current_restart_index_
;
241 primary_index_builder_
.AddIndexEntry(last_key_in_current_block
,
242 first_key_in_next_block
, block_handle
);
245 virtual void OnKeyAdded(const Slice
& key
) override
{
246 auto key_prefix
= hash_key_extractor_
->Transform(key
);
247 bool is_first_entry
= pending_block_num_
== 0;
249 // Keys may share the prefix
250 if (is_first_entry
|| pending_entry_prefix_
!= key_prefix
) {
251 if (!is_first_entry
) {
252 FlushPendingPrefix();
255 // need a hard copy otherwise the underlying data changes all the time.
256 // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
258 pending_entry_prefix_
= key_prefix
.ToString();
259 pending_block_num_
= 1;
260 pending_entry_index_
= static_cast<uint32_t>(current_restart_index_
);
262 // entry number increments when keys share the prefix reside in
263 // different data blocks.
264 auto last_restart_index
= pending_entry_index_
+ pending_block_num_
- 1;
265 assert(last_restart_index
<= current_restart_index_
);
266 if (last_restart_index
!= current_restart_index_
) {
267 ++pending_block_num_
;
272 virtual Status
Finish(
273 IndexBlocks
* index_blocks
,
274 const BlockHandle
& last_partition_block_handle
) override
{
275 if (pending_block_num_
!= 0) {
276 FlushPendingPrefix();
278 primary_index_builder_
.Finish(index_blocks
, last_partition_block_handle
);
279 index_blocks
->meta_blocks
.insert(
280 {kHashIndexPrefixesBlock
.c_str(), prefix_block_
});
281 index_blocks
->meta_blocks
.insert(
282 {kHashIndexPrefixesMetadataBlock
.c_str(), prefix_meta_block_
});
286 virtual size_t IndexSize() const override
{
287 return primary_index_builder_
.IndexSize() + prefix_block_
.size() +
288 prefix_meta_block_
.size();
291 virtual bool seperator_is_key_plus_seq() override
{
292 return primary_index_builder_
.seperator_is_key_plus_seq();
296 void FlushPendingPrefix() {
297 prefix_block_
.append(pending_entry_prefix_
.data(),
298 pending_entry_prefix_
.size());
299 PutVarint32Varint32Varint32(
301 static_cast<uint32_t>(pending_entry_prefix_
.size()),
302 pending_entry_index_
, pending_block_num_
);
305 ShortenedIndexBuilder primary_index_builder_
;
306 const SliceTransform
* hash_key_extractor_
;
308 // stores a sequence of prefixes
309 std::string prefix_block_
;
310 // stores the metadata of prefixes
311 std::string prefix_meta_block_
;
313 // The following 3 variables keeps unflushed prefix and its metadata.
314 // The details of block_num and entry_index can be found in
315 // "block_hash_index.{h,cc}"
316 uint32_t pending_block_num_
= 0;
317 uint32_t pending_entry_index_
= 0;
318 std::string pending_entry_prefix_
;
320 uint64_t current_restart_index_
= 0;
324 * IndexBuilder for two-level indexing. Internally it creates a new index for
325 * each partition and Finish then in order when Finish is called on it
326 * continiously until Status::OK() is returned.
328 * The format on the disk would be I I I I I I IP where I is block containing a
329 * partition of indexes built using ShortenedIndexBuilder and IP is a block
330 * containing a secondary index on the partitions, built using
331 * ShortenedIndexBuilder.
333 class PartitionedIndexBuilder
: public IndexBuilder
{
335 static PartitionedIndexBuilder
* CreateIndexBuilder(
336 const rocksdb::InternalKeyComparator
* comparator
,
337 const bool use_value_delta_encoding
,
338 const BlockBasedTableOptions
& table_opt
);
340 explicit PartitionedIndexBuilder(const InternalKeyComparator
* comparator
,
341 const BlockBasedTableOptions
& table_opt
,
342 const bool use_value_delta_encoding
);
344 virtual ~PartitionedIndexBuilder();
346 virtual void AddIndexEntry(std::string
* last_key_in_current_block
,
347 const Slice
* first_key_in_next_block
,
348 const BlockHandle
& block_handle
) override
;
350 virtual Status
Finish(
351 IndexBlocks
* index_blocks
,
352 const BlockHandle
& last_partition_block_handle
) override
;
354 virtual size_t IndexSize() const override
{ return index_size_
; }
355 size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_
; }
356 size_t NumPartitions() const;
358 inline bool ShouldCutFilterBlock() {
359 // Current policy is to align the partitions of index and filters
360 if (cut_filter_block
) {
361 cut_filter_block
= false;
367 std::string
& GetPartitionKey() { return sub_index_last_key_
; }
369 // Called when an external entity (such as filter partition builder) request
370 // cutting the next partition
371 void RequestPartitionCut();
373 virtual bool seperator_is_key_plus_seq() override
{
374 return seperator_is_key_plus_seq_
;
377 bool get_use_value_delta_encoding() { return use_value_delta_encoding_
; }
380 // Set after ::Finish is called
381 size_t top_level_index_size_
= 0;
382 // Set after ::Finish is called
383 size_t partition_cnt_
= 0;
385 void MakeNewSubIndexBuilder();
389 std::unique_ptr
<ShortenedIndexBuilder
> value
;
391 std::list
<Entry
> entries_
; // list of partitioned indexes and their keys
392 BlockBuilder index_block_builder_
; // top-level index builder
393 BlockBuilder index_block_builder_without_seq_
; // same for user keys
394 // the active partition index builder
395 ShortenedIndexBuilder
* sub_index_builder_
;
396 // the last key in the active partition index builder
397 std::string sub_index_last_key_
;
398 std::unique_ptr
<FlushBlockPolicy
> flush_policy_
;
399 // true if Finish is called once but not complete yet.
400 bool finishing_indexes
= false;
401 const BlockBasedTableOptions
& table_opt_
;
402 bool seperator_is_key_plus_seq_
;
403 bool use_value_delta_encoding_
;
404 // true if an external entity (such as filter partition builder) request
405 // cutting the next partition
406 bool partition_cut_requested_
= true;
407 // true if it should cut the next filter partition block
408 bool cut_filter_block
= false;
409 BlockHandle last_encoded_handle_
;
411 } // namespace rocksdb