]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
12 | #include <assert.h> | |
7c673cae | 13 | |
1e59de90 | 14 | #include <cinttypes> |
7c673cae FG |
15 | #include <list> |
16 | #include <string> | |
17 | #include <unordered_map> | |
18 | ||
19 | #include "rocksdb/comparator.h" | |
f67539c2 TL |
20 | #include "table/block_based/block_based_table_factory.h" |
21 | #include "table/block_based/block_builder.h" | |
7c673cae FG |
22 | #include "table/format.h" |
23 | ||
f67539c2 | 24 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
25 | // The interface for building index. |
26 | // Instruction for adding a new concrete IndexBuilder: | |
27 | // 1. Create a subclass instantiated from IndexBuilder. | |
28 | // 2. Add a new entry associated with that subclass in TableOptions::IndexType. | |
29 | // 3. Add a create function for the new subclass in CreateIndexBuilder. | |
30 | // Note: we can devise more advanced design to simplify the process for adding | |
31 | // new subclass, which will, on the other hand, increase the code complexity and | |
32 | // catch unwanted attention from readers. Given that we won't add/change | |
33 | // indexes frequently, it makes sense to just embrace a more straightforward | |
34 | // design that just works. | |
35 | class IndexBuilder { | |
36 | public: | |
37 | static IndexBuilder* CreateIndexBuilder( | |
38 | BlockBasedTableOptions::IndexType index_type, | |
f67539c2 | 39 | const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, |
7c673cae | 40 | const InternalKeySliceTransform* int_key_slice_transform, |
11fdf7f2 | 41 | const bool use_value_delta_encoding, |
7c673cae FG |
42 | const BlockBasedTableOptions& table_opt); |
43 | ||
44 | // Index builder will construct a set of blocks which contain: | |
45 | // 1. One primary index block. | |
46 | // 2. (Optional) a set of metablocks that contains the metadata of the | |
47 | // primary index. | |
48 | struct IndexBlocks { | |
49 | Slice index_block_contents; | |
50 | std::unordered_map<std::string, Slice> meta_blocks; | |
51 | }; | |
52 | explicit IndexBuilder(const InternalKeyComparator* comparator) | |
53 | : comparator_(comparator) {} | |
54 | ||
55 | virtual ~IndexBuilder() {} | |
56 | ||
57 | // Add a new index entry to index block. | |
58 | // To allow further optimization, we provide `last_key_in_current_block` and | |
59 | // `first_key_in_next_block`, based on which the specific implementation can | |
60 | // determine the best index key to be used for the index block. | |
f67539c2 | 61 | // Called before the OnKeyAdded() call for first_key_in_next_block. |
7c673cae FG |
62 | // @last_key_in_current_block: this parameter maybe overridden with the value |
63 | // "substitute key". | |
64 | // @first_key_in_next_block: it will be nullptr if the entry being added is | |
65 | // the last one in the table | |
66 | // | |
67 | // REQUIRES: Finish() has not yet been called. | |
68 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
69 | const Slice* first_key_in_next_block, | |
70 | const BlockHandle& block_handle) = 0; | |
71 | ||
72 | // This method will be called whenever a key is added. The subclasses may | |
73 | // override OnKeyAdded() if they need to collect additional information. | |
11fdf7f2 | 74 | virtual void OnKeyAdded(const Slice& /*key*/) {} |
7c673cae FG |
75 | |
76 | // Inform the index builder that all entries has been written. Block builder | |
77 | // may therefore perform any operation required for block finalization. | |
78 | // | |
79 | // REQUIRES: Finish() has not yet been called. | |
80 | inline Status Finish(IndexBlocks* index_blocks) { | |
81 | // Throw away the changes to last_partition_block_handle. It has no effect | |
82 | // on the first call to Finish anyway. | |
83 | BlockHandle last_partition_block_handle; | |
84 | return Finish(index_blocks, last_partition_block_handle); | |
85 | } | |
86 | ||
87 | // This override of Finish can be utilized to build the 2nd level index in | |
88 | // PartitionIndexBuilder. | |
89 | // | |
90 | // index_blocks will be filled with the resulting index data. If the return | |
91 | // value is Status::InComplete() then it means that the index is partitioned | |
92 | // and the callee should keep calling Finish until Status::OK() is returned. | |
93 | // In that case, last_partition_block_handle is pointer to the block written | |
94 | // with the result of the last call to Finish. This can be utilized to build | |
95 | // the second level index pointing to each block of partitioned indexes. The | |
96 | // last call to Finish() that returns Status::OK() populates index_blocks with | |
97 | // the 2nd level index content. | |
98 | virtual Status Finish(IndexBlocks* index_blocks, | |
99 | const BlockHandle& last_partition_block_handle) = 0; | |
100 | ||
11fdf7f2 TL |
101 | // Get the size for index block. Must be called after ::Finish. |
102 | virtual size_t IndexSize() const = 0; | |
103 | ||
104 | virtual bool seperator_is_key_plus_seq() { return true; } | |
7c673cae FG |
105 | |
106 | protected: | |
107 | const InternalKeyComparator* comparator_; | |
11fdf7f2 TL |
108 | // Set after ::Finish is called |
109 | size_t index_size_ = 0; | |
7c673cae FG |
110 | }; |
111 | ||
112 | // This index builder builds space-efficient index block. | |
113 | // | |
114 | // Optimizations: | |
115 | // 1. Made block's `block_restart_interval` to be 1, which will avoid linear | |
116 | // search when doing index lookup (can be disabled by setting | |
117 | // index_block_restart_interval). | |
118 | // 2. Shorten the key length for index block. Other than honestly using the | |
119 | // last key in the data block as the index key, we instead find a shortest | |
120 | // substitute key that serves the same function. | |
121 | class ShortenedIndexBuilder : public IndexBuilder { | |
122 | public: | |
f67539c2 TL |
123 | explicit ShortenedIndexBuilder( |
124 | const InternalKeyComparator* comparator, | |
125 | const int index_block_restart_interval, const uint32_t format_version, | |
126 | const bool use_value_delta_encoding, | |
127 | BlockBasedTableOptions::IndexShorteningMode shortening_mode, | |
128 | bool include_first_key) | |
7c673cae | 129 | : IndexBuilder(comparator), |
11fdf7f2 TL |
130 | index_block_builder_(index_block_restart_interval, |
131 | true /*use_delta_encoding*/, | |
132 | use_value_delta_encoding), | |
133 | index_block_builder_without_seq_(index_block_restart_interval, | |
134 | true /*use_delta_encoding*/, | |
f67539c2 TL |
135 | use_value_delta_encoding), |
136 | use_value_delta_encoding_(use_value_delta_encoding), | |
137 | include_first_key_(include_first_key), | |
138 | shortening_mode_(shortening_mode) { | |
11fdf7f2 TL |
139 | // Making the default true will disable the feature for old versions |
140 | seperator_is_key_plus_seq_ = (format_version <= 2); | |
141 | } | |
7c673cae | 142 | |
f67539c2 TL |
143 | virtual void OnKeyAdded(const Slice& key) override { |
144 | if (include_first_key_ && current_block_first_internal_key_.empty()) { | |
145 | current_block_first_internal_key_.assign(key.data(), key.size()); | |
146 | } | |
147 | } | |
148 | ||
7c673cae FG |
149 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
150 | const Slice* first_key_in_next_block, | |
151 | const BlockHandle& block_handle) override { | |
152 | if (first_key_in_next_block != nullptr) { | |
f67539c2 TL |
153 | if (shortening_mode_ != |
154 | BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { | |
1e59de90 TL |
155 | FindShortestInternalKeySeparator(*comparator_->user_comparator(), |
156 | last_key_in_current_block, | |
157 | *first_key_in_next_block); | |
f67539c2 | 158 | } |
11fdf7f2 TL |
159 | if (!seperator_is_key_plus_seq_ && |
160 | comparator_->user_comparator()->Compare( | |
161 | ExtractUserKey(*last_key_in_current_block), | |
162 | ExtractUserKey(*first_key_in_next_block)) == 0) { | |
163 | seperator_is_key_plus_seq_ = true; | |
164 | } | |
7c673cae | 165 | } else { |
f67539c2 TL |
166 | if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: |
167 | kShortenSeparatorsAndSuccessor) { | |
1e59de90 TL |
168 | FindShortInternalKeySuccessor(*comparator_->user_comparator(), |
169 | last_key_in_current_block); | |
f67539c2 | 170 | } |
7c673cae | 171 | } |
11fdf7f2 | 172 | auto sep = Slice(*last_key_in_current_block); |
7c673cae | 173 | |
f67539c2 TL |
174 | assert(!include_first_key_ || !current_block_first_internal_key_.empty()); |
175 | IndexValue entry(block_handle, current_block_first_internal_key_); | |
176 | std::string encoded_entry; | |
177 | std::string delta_encoded_entry; | |
178 | entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); | |
179 | if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { | |
180 | entry.EncodeTo(&delta_encoded_entry, include_first_key_, | |
181 | &last_encoded_handle_); | |
182 | } else { | |
183 | // If it's the first block, or delta encoding is disabled, | |
184 | // BlockBuilder::Add() below won't use delta-encoded slice. | |
185 | } | |
11fdf7f2 | 186 | last_encoded_handle_ = block_handle; |
f67539c2 TL |
187 | const Slice delta_encoded_entry_slice(delta_encoded_entry); |
188 | index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); | |
11fdf7f2 | 189 | if (!seperator_is_key_plus_seq_) { |
f67539c2 TL |
190 | index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, |
191 | &delta_encoded_entry_slice); | |
11fdf7f2 | 192 | } |
f67539c2 TL |
193 | |
194 | current_block_first_internal_key_.clear(); | |
7c673cae FG |
195 | } |
196 | ||
197 | using IndexBuilder::Finish; | |
198 | virtual Status Finish( | |
199 | IndexBlocks* index_blocks, | |
11fdf7f2 TL |
200 | const BlockHandle& /*last_partition_block_handle*/) override { |
201 | if (seperator_is_key_plus_seq_) { | |
202 | index_blocks->index_block_contents = index_block_builder_.Finish(); | |
203 | } else { | |
204 | index_blocks->index_block_contents = | |
205 | index_block_builder_without_seq_.Finish(); | |
206 | } | |
207 | index_size_ = index_blocks->index_block_contents.size(); | |
7c673cae FG |
208 | return Status::OK(); |
209 | } | |
210 | ||
11fdf7f2 TL |
211 | virtual size_t IndexSize() const override { return index_size_; } |
212 | ||
213 | virtual bool seperator_is_key_plus_seq() override { | |
214 | return seperator_is_key_plus_seq_; | |
7c673cae FG |
215 | } |
216 | ||
1e59de90 TL |
217 | // Changes *key to a short string >= *key. |
218 | // | |
219 | static void FindShortestInternalKeySeparator(const Comparator& comparator, | |
220 | std::string* start, | |
221 | const Slice& limit); | |
222 | ||
223 | static void FindShortInternalKeySuccessor(const Comparator& comparator, | |
224 | std::string* key); | |
225 | ||
7c673cae FG |
226 | friend class PartitionedIndexBuilder; |
227 | ||
228 | private: | |
229 | BlockBuilder index_block_builder_; | |
11fdf7f2 | 230 | BlockBuilder index_block_builder_without_seq_; |
f67539c2 | 231 | const bool use_value_delta_encoding_; |
11fdf7f2 | 232 | bool seperator_is_key_plus_seq_; |
f67539c2 TL |
233 | const bool include_first_key_; |
234 | BlockBasedTableOptions::IndexShorteningMode shortening_mode_; | |
235 | BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); | |
236 | std::string current_block_first_internal_key_; | |
7c673cae FG |
237 | }; |
238 | ||
239 | // HashIndexBuilder contains a binary-searchable primary index and the | |
240 | // metadata for secondary hash index construction. | |
241 | // The metadata for hash index consists two parts: | |
242 | // - a metablock that compactly contains a sequence of prefixes. All prefixes | |
243 | // are stored consectively without any metadata (like, prefix sizes) being | |
244 | // stored, which is kept in the other metablock. | |
245 | // - a metablock contains the metadata of the prefixes, including prefix size, | |
246 | // restart index and number of block it spans. The format looks like: | |
247 | // | |
248 | // +-----------------+---------------------------+---------------------+ | |
249 | // <=prefix 1 | |
250 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
251 | // +-----------------+---------------------------+---------------------+ | |
252 | // <=prefix 2 | |
253 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
254 | // +-----------------+---------------------------+---------------------+ | |
255 | // | | | |
256 | // | .... | | |
257 | // | | | |
258 | // +-----------------+---------------------------+---------------------+ | |
259 | // <=prefix n | |
260 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
261 | // +-----------------+---------------------------+---------------------+ | |
262 | // | |
263 | // The reason of separating these two metablocks is to enable the efficiently | |
264 | // reuse the first metablock during hash index construction without unnecessary | |
265 | // data copy or small heap allocations for prefixes. | |
266 | class HashIndexBuilder : public IndexBuilder { | |
267 | public: | |
f67539c2 TL |
268 | explicit HashIndexBuilder( |
269 | const InternalKeyComparator* comparator, | |
270 | const SliceTransform* hash_key_extractor, | |
271 | int index_block_restart_interval, int format_version, | |
272 | bool use_value_delta_encoding, | |
273 | BlockBasedTableOptions::IndexShorteningMode shortening_mode) | |
7c673cae | 274 | : IndexBuilder(comparator), |
11fdf7f2 | 275 | primary_index_builder_(comparator, index_block_restart_interval, |
f67539c2 TL |
276 | format_version, use_value_delta_encoding, |
277 | shortening_mode, /* include_first_key */ false), | |
7c673cae FG |
278 | hash_key_extractor_(hash_key_extractor) {} |
279 | ||
280 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
281 | const Slice* first_key_in_next_block, | |
282 | const BlockHandle& block_handle) override { | |
283 | ++current_restart_index_; | |
284 | primary_index_builder_.AddIndexEntry(last_key_in_current_block, | |
285 | first_key_in_next_block, block_handle); | |
286 | } | |
287 | ||
288 | virtual void OnKeyAdded(const Slice& key) override { | |
289 | auto key_prefix = hash_key_extractor_->Transform(key); | |
290 | bool is_first_entry = pending_block_num_ == 0; | |
291 | ||
292 | // Keys may share the prefix | |
293 | if (is_first_entry || pending_entry_prefix_ != key_prefix) { | |
294 | if (!is_first_entry) { | |
295 | FlushPendingPrefix(); | |
296 | } | |
297 | ||
298 | // need a hard copy otherwise the underlying data changes all the time. | |
1e59de90 TL |
299 | // TODO(kailiu) std::to_string() is expensive. We may speed up can avoid |
300 | // data copy. | |
7c673cae FG |
301 | pending_entry_prefix_ = key_prefix.ToString(); |
302 | pending_block_num_ = 1; | |
303 | pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); | |
304 | } else { | |
305 | // entry number increments when keys share the prefix reside in | |
306 | // different data blocks. | |
307 | auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; | |
308 | assert(last_restart_index <= current_restart_index_); | |
309 | if (last_restart_index != current_restart_index_) { | |
310 | ++pending_block_num_; | |
311 | } | |
312 | } | |
313 | } | |
314 | ||
315 | virtual Status Finish( | |
316 | IndexBlocks* index_blocks, | |
317 | const BlockHandle& last_partition_block_handle) override { | |
11fdf7f2 TL |
318 | if (pending_block_num_ != 0) { |
319 | FlushPendingPrefix(); | |
320 | } | |
20effc67 TL |
321 | Status s = primary_index_builder_.Finish(index_blocks, |
322 | last_partition_block_handle); | |
7c673cae FG |
323 | index_blocks->meta_blocks.insert( |
324 | {kHashIndexPrefixesBlock.c_str(), prefix_block_}); | |
325 | index_blocks->meta_blocks.insert( | |
326 | {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); | |
20effc67 | 327 | return s; |
7c673cae FG |
328 | } |
329 | ||
11fdf7f2 TL |
330 | virtual size_t IndexSize() const override { |
331 | return primary_index_builder_.IndexSize() + prefix_block_.size() + | |
7c673cae FG |
332 | prefix_meta_block_.size(); |
333 | } | |
334 | ||
11fdf7f2 TL |
335 | virtual bool seperator_is_key_plus_seq() override { |
336 | return primary_index_builder_.seperator_is_key_plus_seq(); | |
337 | } | |
338 | ||
7c673cae FG |
339 | private: |
340 | void FlushPendingPrefix() { | |
341 | prefix_block_.append(pending_entry_prefix_.data(), | |
342 | pending_entry_prefix_.size()); | |
343 | PutVarint32Varint32Varint32( | |
344 | &prefix_meta_block_, | |
345 | static_cast<uint32_t>(pending_entry_prefix_.size()), | |
346 | pending_entry_index_, pending_block_num_); | |
347 | } | |
348 | ||
349 | ShortenedIndexBuilder primary_index_builder_; | |
350 | const SliceTransform* hash_key_extractor_; | |
351 | ||
352 | // stores a sequence of prefixes | |
353 | std::string prefix_block_; | |
354 | // stores the metadata of prefixes | |
355 | std::string prefix_meta_block_; | |
356 | ||
357 | // The following 3 variables keeps unflushed prefix and its metadata. | |
358 | // The details of block_num and entry_index can be found in | |
359 | // "block_hash_index.{h,cc}" | |
360 | uint32_t pending_block_num_ = 0; | |
361 | uint32_t pending_entry_index_ = 0; | |
362 | std::string pending_entry_prefix_; | |
363 | ||
364 | uint64_t current_restart_index_ = 0; | |
365 | }; | |
366 | ||
367 | /** | |
368 | * IndexBuilder for two-level indexing. Internally it creates a new index for | |
369 | * each partition and Finish then in order when Finish is called on it | |
370 | * continiously until Status::OK() is returned. | |
371 | * | |
372 | * The format on the disk would be I I I I I I IP where I is block containing a | |
373 | * partition of indexes built using ShortenedIndexBuilder and IP is a block | |
374 | * containing a secondary index on the partitions, built using | |
375 | * ShortenedIndexBuilder. | |
376 | */ | |
377 | class PartitionedIndexBuilder : public IndexBuilder { | |
378 | public: | |
379 | static PartitionedIndexBuilder* CreateIndexBuilder( | |
f67539c2 | 380 | const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, |
11fdf7f2 | 381 | const bool use_value_delta_encoding, |
7c673cae FG |
382 | const BlockBasedTableOptions& table_opt); |
383 | ||
384 | explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, | |
11fdf7f2 TL |
385 | const BlockBasedTableOptions& table_opt, |
386 | const bool use_value_delta_encoding); | |
7c673cae FG |
387 | |
388 | virtual ~PartitionedIndexBuilder(); | |
389 | ||
390 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
391 | const Slice* first_key_in_next_block, | |
392 | const BlockHandle& block_handle) override; | |
393 | ||
394 | virtual Status Finish( | |
395 | IndexBlocks* index_blocks, | |
396 | const BlockHandle& last_partition_block_handle) override; | |
397 | ||
11fdf7f2 TL |
398 | virtual size_t IndexSize() const override { return index_size_; } |
399 | size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } | |
400 | size_t NumPartitions() const; | |
7c673cae FG |
401 | |
402 | inline bool ShouldCutFilterBlock() { | |
403 | // Current policy is to align the partitions of index and filters | |
404 | if (cut_filter_block) { | |
405 | cut_filter_block = false; | |
406 | return true; | |
407 | } | |
408 | return false; | |
409 | } | |
410 | ||
411 | std::string& GetPartitionKey() { return sub_index_last_key_; } | |
412 | ||
11fdf7f2 TL |
413 | // Called when an external entity (such as filter partition builder) request |
414 | // cutting the next partition | |
415 | void RequestPartitionCut(); | |
416 | ||
417 | virtual bool seperator_is_key_plus_seq() override { | |
418 | return seperator_is_key_plus_seq_; | |
419 | } | |
420 | ||
421 | bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } | |
422 | ||
7c673cae | 423 | private: |
11fdf7f2 TL |
424 | // Set after ::Finish is called |
425 | size_t top_level_index_size_ = 0; | |
426 | // Set after ::Finish is called | |
427 | size_t partition_cnt_ = 0; | |
428 | ||
7c673cae FG |
429 | void MakeNewSubIndexBuilder(); |
430 | ||
431 | struct Entry { | |
432 | std::string key; | |
433 | std::unique_ptr<ShortenedIndexBuilder> value; | |
434 | }; | |
435 | std::list<Entry> entries_; // list of partitioned indexes and their keys | |
f67539c2 | 436 | BlockBuilder index_block_builder_; // top-level index builder |
11fdf7f2 | 437 | BlockBuilder index_block_builder_without_seq_; // same for user keys |
7c673cae FG |
438 | // the active partition index builder |
439 | ShortenedIndexBuilder* sub_index_builder_; | |
440 | // the last key in the active partition index builder | |
441 | std::string sub_index_last_key_; | |
442 | std::unique_ptr<FlushBlockPolicy> flush_policy_; | |
443 | // true if Finish is called once but not complete yet. | |
444 | bool finishing_indexes = false; | |
445 | const BlockBasedTableOptions& table_opt_; | |
11fdf7f2 TL |
446 | bool seperator_is_key_plus_seq_; |
447 | bool use_value_delta_encoding_; | |
448 | // true if an external entity (such as filter partition builder) request | |
449 | // cutting the next partition | |
450 | bool partition_cut_requested_ = true; | |
7c673cae FG |
451 | // true if it should cut the next filter partition block |
452 | bool cut_filter_block = false; | |
11fdf7f2 | 453 | BlockHandle last_encoded_handle_; |
7c673cae | 454 | }; |
f67539c2 | 455 | } // namespace ROCKSDB_NAMESPACE |