]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
12 | #include <assert.h> | |
13 | #include <inttypes.h> | |
14 | ||
15 | #include <list> | |
16 | #include <string> | |
17 | #include <unordered_map> | |
18 | ||
19 | #include "rocksdb/comparator.h" | |
20 | #include "table/block_based_table_factory.h" | |
21 | #include "table/block_builder.h" | |
22 | #include "table/format.h" | |
23 | ||
24 | namespace rocksdb { | |
25 | // The interface for building index. | |
26 | // Instruction for adding a new concrete IndexBuilder: | |
27 | // 1. Create a subclass instantiated from IndexBuilder. | |
28 | // 2. Add a new entry associated with that subclass in TableOptions::IndexType. | |
29 | // 3. Add a create function for the new subclass in CreateIndexBuilder. | |
30 | // Note: we can devise more advanced design to simplify the process for adding | |
31 | // new subclass, which will, on the other hand, increase the code complexity and | |
32 | // catch unwanted attention from readers. Given that we won't add/change | |
33 | // indexes frequently, it makes sense to just embrace a more straightforward | |
34 | // design that just works. | |
35 | class IndexBuilder { | |
36 | public: | |
37 | static IndexBuilder* CreateIndexBuilder( | |
38 | BlockBasedTableOptions::IndexType index_type, | |
39 | const rocksdb::InternalKeyComparator* comparator, | |
40 | const InternalKeySliceTransform* int_key_slice_transform, | |
11fdf7f2 | 41 | const bool use_value_delta_encoding, |
7c673cae FG |
42 | const BlockBasedTableOptions& table_opt); |
43 | ||
44 | // Index builder will construct a set of blocks which contain: | |
45 | // 1. One primary index block. | |
46 | // 2. (Optional) a set of metablocks that contains the metadata of the | |
47 | // primary index. | |
48 | struct IndexBlocks { | |
49 | Slice index_block_contents; | |
50 | std::unordered_map<std::string, Slice> meta_blocks; | |
51 | }; | |
52 | explicit IndexBuilder(const InternalKeyComparator* comparator) | |
53 | : comparator_(comparator) {} | |
54 | ||
55 | virtual ~IndexBuilder() {} | |
56 | ||
57 | // Add a new index entry to index block. | |
58 | // To allow further optimization, we provide `last_key_in_current_block` and | |
59 | // `first_key_in_next_block`, based on which the specific implementation can | |
60 | // determine the best index key to be used for the index block. | |
61 | // @last_key_in_current_block: this parameter maybe overridden with the value | |
62 | // "substitute key". | |
63 | // @first_key_in_next_block: it will be nullptr if the entry being added is | |
64 | // the last one in the table | |
65 | // | |
66 | // REQUIRES: Finish() has not yet been called. | |
67 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
68 | const Slice* first_key_in_next_block, | |
69 | const BlockHandle& block_handle) = 0; | |
70 | ||
71 | // This method will be called whenever a key is added. The subclasses may | |
72 | // override OnKeyAdded() if they need to collect additional information. | |
11fdf7f2 | 73 | virtual void OnKeyAdded(const Slice& /*key*/) {} |
7c673cae FG |
74 | |
75 | // Inform the index builder that all entries has been written. Block builder | |
76 | // may therefore perform any operation required for block finalization. | |
77 | // | |
78 | // REQUIRES: Finish() has not yet been called. | |
79 | inline Status Finish(IndexBlocks* index_blocks) { | |
80 | // Throw away the changes to last_partition_block_handle. It has no effect | |
81 | // on the first call to Finish anyway. | |
82 | BlockHandle last_partition_block_handle; | |
83 | return Finish(index_blocks, last_partition_block_handle); | |
84 | } | |
85 | ||
86 | // This override of Finish can be utilized to build the 2nd level index in | |
87 | // PartitionIndexBuilder. | |
88 | // | |
89 | // index_blocks will be filled with the resulting index data. If the return | |
90 | // value is Status::InComplete() then it means that the index is partitioned | |
91 | // and the callee should keep calling Finish until Status::OK() is returned. | |
92 | // In that case, last_partition_block_handle is pointer to the block written | |
93 | // with the result of the last call to Finish. This can be utilized to build | |
94 | // the second level index pointing to each block of partitioned indexes. The | |
95 | // last call to Finish() that returns Status::OK() populates index_blocks with | |
96 | // the 2nd level index content. | |
97 | virtual Status Finish(IndexBlocks* index_blocks, | |
98 | const BlockHandle& last_partition_block_handle) = 0; | |
99 | ||
11fdf7f2 TL |
100 | // Get the size for index block. Must be called after ::Finish. |
101 | virtual size_t IndexSize() const = 0; | |
102 | ||
103 | virtual bool seperator_is_key_plus_seq() { return true; } | |
7c673cae FG |
104 | |
105 | protected: | |
106 | const InternalKeyComparator* comparator_; | |
11fdf7f2 TL |
107 | // Set after ::Finish is called |
108 | size_t index_size_ = 0; | |
7c673cae FG |
109 | }; |
110 | ||
111 | // This index builder builds space-efficient index block. | |
112 | // | |
113 | // Optimizations: | |
114 | // 1. Made block's `block_restart_interval` to be 1, which will avoid linear | |
115 | // search when doing index lookup (can be disabled by setting | |
116 | // index_block_restart_interval). | |
117 | // 2. Shorten the key length for index block. Other than honestly using the | |
118 | // last key in the data block as the index key, we instead find a shortest | |
119 | // substitute key that serves the same function. | |
120 | class ShortenedIndexBuilder : public IndexBuilder { | |
121 | public: | |
122 | explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, | |
11fdf7f2 TL |
123 | const int index_block_restart_interval, |
124 | const uint32_t format_version, | |
125 | const bool use_value_delta_encoding) | |
7c673cae | 126 | : IndexBuilder(comparator), |
11fdf7f2 TL |
127 | index_block_builder_(index_block_restart_interval, |
128 | true /*use_delta_encoding*/, | |
129 | use_value_delta_encoding), | |
130 | index_block_builder_without_seq_(index_block_restart_interval, | |
131 | true /*use_delta_encoding*/, | |
132 | use_value_delta_encoding) { | |
133 | // Making the default true will disable the feature for old versions | |
134 | seperator_is_key_plus_seq_ = (format_version <= 2); | |
135 | } | |
7c673cae FG |
136 | |
137 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
138 | const Slice* first_key_in_next_block, | |
139 | const BlockHandle& block_handle) override { | |
140 | if (first_key_in_next_block != nullptr) { | |
141 | comparator_->FindShortestSeparator(last_key_in_current_block, | |
142 | *first_key_in_next_block); | |
11fdf7f2 TL |
143 | if (!seperator_is_key_plus_seq_ && |
144 | comparator_->user_comparator()->Compare( | |
145 | ExtractUserKey(*last_key_in_current_block), | |
146 | ExtractUserKey(*first_key_in_next_block)) == 0) { | |
147 | seperator_is_key_plus_seq_ = true; | |
148 | } | |
7c673cae FG |
149 | } else { |
150 | comparator_->FindShortSuccessor(last_key_in_current_block); | |
151 | } | |
11fdf7f2 | 152 | auto sep = Slice(*last_key_in_current_block); |
7c673cae FG |
153 | |
154 | std::string handle_encoding; | |
155 | block_handle.EncodeTo(&handle_encoding); | |
11fdf7f2 TL |
156 | std::string handle_delta_encoding; |
157 | PutVarsignedint64(&handle_delta_encoding, | |
158 | block_handle.size() - last_encoded_handle_.size()); | |
159 | assert(handle_delta_encoding.size() != 0); | |
160 | last_encoded_handle_ = block_handle; | |
161 | const Slice handle_delta_encoding_slice(handle_delta_encoding); | |
162 | index_block_builder_.Add(sep, handle_encoding, | |
163 | &handle_delta_encoding_slice); | |
164 | if (!seperator_is_key_plus_seq_) { | |
165 | index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, | |
166 | &handle_delta_encoding_slice); | |
167 | } | |
7c673cae FG |
168 | } |
169 | ||
170 | using IndexBuilder::Finish; | |
171 | virtual Status Finish( | |
172 | IndexBlocks* index_blocks, | |
11fdf7f2 TL |
173 | const BlockHandle& /*last_partition_block_handle*/) override { |
174 | if (seperator_is_key_plus_seq_) { | |
175 | index_blocks->index_block_contents = index_block_builder_.Finish(); | |
176 | } else { | |
177 | index_blocks->index_block_contents = | |
178 | index_block_builder_without_seq_.Finish(); | |
179 | } | |
180 | index_size_ = index_blocks->index_block_contents.size(); | |
7c673cae FG |
181 | return Status::OK(); |
182 | } | |
183 | ||
11fdf7f2 TL |
184 | virtual size_t IndexSize() const override { return index_size_; } |
185 | ||
186 | virtual bool seperator_is_key_plus_seq() override { | |
187 | return seperator_is_key_plus_seq_; | |
7c673cae FG |
188 | } |
189 | ||
190 | friend class PartitionedIndexBuilder; | |
191 | ||
192 | private: | |
193 | BlockBuilder index_block_builder_; | |
11fdf7f2 TL |
194 | BlockBuilder index_block_builder_without_seq_; |
195 | bool seperator_is_key_plus_seq_; | |
196 | BlockHandle last_encoded_handle_; | |
7c673cae FG |
197 | }; |
198 | ||
199 | // HashIndexBuilder contains a binary-searchable primary index and the | |
200 | // metadata for secondary hash index construction. | |
201 | // The metadata for hash index consists two parts: | |
202 | // - a metablock that compactly contains a sequence of prefixes. All prefixes | |
203 | // are stored consectively without any metadata (like, prefix sizes) being | |
204 | // stored, which is kept in the other metablock. | |
205 | // - a metablock contains the metadata of the prefixes, including prefix size, | |
206 | // restart index and number of block it spans. The format looks like: | |
207 | // | |
208 | // +-----------------+---------------------------+---------------------+ | |
209 | // <=prefix 1 | |
210 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
211 | // +-----------------+---------------------------+---------------------+ | |
212 | // <=prefix 2 | |
213 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
214 | // +-----------------+---------------------------+---------------------+ | |
215 | // | | | |
216 | // | .... | | |
217 | // | | | |
218 | // +-----------------+---------------------------+---------------------+ | |
219 | // <=prefix n | |
220 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | | |
221 | // +-----------------+---------------------------+---------------------+ | |
222 | // | |
223 | // The reason of separating these two metablocks is to enable the efficiently | |
224 | // reuse the first metablock during hash index construction without unnecessary | |
225 | // data copy or small heap allocations for prefixes. | |
226 | class HashIndexBuilder : public IndexBuilder { | |
227 | public: | |
228 | explicit HashIndexBuilder(const InternalKeyComparator* comparator, | |
229 | const SliceTransform* hash_key_extractor, | |
11fdf7f2 TL |
230 | int index_block_restart_interval, |
231 | int format_version, bool use_value_delta_encoding) | |
7c673cae | 232 | : IndexBuilder(comparator), |
11fdf7f2 TL |
233 | primary_index_builder_(comparator, index_block_restart_interval, |
234 | format_version, use_value_delta_encoding), | |
7c673cae FG |
235 | hash_key_extractor_(hash_key_extractor) {} |
236 | ||
237 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
238 | const Slice* first_key_in_next_block, | |
239 | const BlockHandle& block_handle) override { | |
240 | ++current_restart_index_; | |
241 | primary_index_builder_.AddIndexEntry(last_key_in_current_block, | |
242 | first_key_in_next_block, block_handle); | |
243 | } | |
244 | ||
245 | virtual void OnKeyAdded(const Slice& key) override { | |
246 | auto key_prefix = hash_key_extractor_->Transform(key); | |
247 | bool is_first_entry = pending_block_num_ == 0; | |
248 | ||
249 | // Keys may share the prefix | |
250 | if (is_first_entry || pending_entry_prefix_ != key_prefix) { | |
251 | if (!is_first_entry) { | |
252 | FlushPendingPrefix(); | |
253 | } | |
254 | ||
255 | // need a hard copy otherwise the underlying data changes all the time. | |
256 | // TODO(kailiu) ToString() is expensive. We may speed up can avoid data | |
257 | // copy. | |
258 | pending_entry_prefix_ = key_prefix.ToString(); | |
259 | pending_block_num_ = 1; | |
260 | pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); | |
261 | } else { | |
262 | // entry number increments when keys share the prefix reside in | |
263 | // different data blocks. | |
264 | auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; | |
265 | assert(last_restart_index <= current_restart_index_); | |
266 | if (last_restart_index != current_restart_index_) { | |
267 | ++pending_block_num_; | |
268 | } | |
269 | } | |
270 | } | |
271 | ||
272 | virtual Status Finish( | |
273 | IndexBlocks* index_blocks, | |
274 | const BlockHandle& last_partition_block_handle) override { | |
11fdf7f2 TL |
275 | if (pending_block_num_ != 0) { |
276 | FlushPendingPrefix(); | |
277 | } | |
7c673cae FG |
278 | primary_index_builder_.Finish(index_blocks, last_partition_block_handle); |
279 | index_blocks->meta_blocks.insert( | |
280 | {kHashIndexPrefixesBlock.c_str(), prefix_block_}); | |
281 | index_blocks->meta_blocks.insert( | |
282 | {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); | |
283 | return Status::OK(); | |
284 | } | |
285 | ||
11fdf7f2 TL |
286 | virtual size_t IndexSize() const override { |
287 | return primary_index_builder_.IndexSize() + prefix_block_.size() + | |
7c673cae FG |
288 | prefix_meta_block_.size(); |
289 | } | |
290 | ||
11fdf7f2 TL |
291 | virtual bool seperator_is_key_plus_seq() override { |
292 | return primary_index_builder_.seperator_is_key_plus_seq(); | |
293 | } | |
294 | ||
7c673cae FG |
295 | private: |
296 | void FlushPendingPrefix() { | |
297 | prefix_block_.append(pending_entry_prefix_.data(), | |
298 | pending_entry_prefix_.size()); | |
299 | PutVarint32Varint32Varint32( | |
300 | &prefix_meta_block_, | |
301 | static_cast<uint32_t>(pending_entry_prefix_.size()), | |
302 | pending_entry_index_, pending_block_num_); | |
303 | } | |
304 | ||
305 | ShortenedIndexBuilder primary_index_builder_; | |
306 | const SliceTransform* hash_key_extractor_; | |
307 | ||
308 | // stores a sequence of prefixes | |
309 | std::string prefix_block_; | |
310 | // stores the metadata of prefixes | |
311 | std::string prefix_meta_block_; | |
312 | ||
313 | // The following 3 variables keeps unflushed prefix and its metadata. | |
314 | // The details of block_num and entry_index can be found in | |
315 | // "block_hash_index.{h,cc}" | |
316 | uint32_t pending_block_num_ = 0; | |
317 | uint32_t pending_entry_index_ = 0; | |
318 | std::string pending_entry_prefix_; | |
319 | ||
320 | uint64_t current_restart_index_ = 0; | |
321 | }; | |
322 | ||
323 | /** | |
324 | * IndexBuilder for two-level indexing. Internally it creates a new index for | |
325 | * each partition and Finish then in order when Finish is called on it | |
326 | * continiously until Status::OK() is returned. | |
327 | * | |
328 | * The format on the disk would be I I I I I I IP where I is block containing a | |
329 | * partition of indexes built using ShortenedIndexBuilder and IP is a block | |
330 | * containing a secondary index on the partitions, built using | |
331 | * ShortenedIndexBuilder. | |
332 | */ | |
333 | class PartitionedIndexBuilder : public IndexBuilder { | |
334 | public: | |
335 | static PartitionedIndexBuilder* CreateIndexBuilder( | |
336 | const rocksdb::InternalKeyComparator* comparator, | |
11fdf7f2 | 337 | const bool use_value_delta_encoding, |
7c673cae FG |
338 | const BlockBasedTableOptions& table_opt); |
339 | ||
340 | explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, | |
11fdf7f2 TL |
341 | const BlockBasedTableOptions& table_opt, |
342 | const bool use_value_delta_encoding); | |
7c673cae FG |
343 | |
344 | virtual ~PartitionedIndexBuilder(); | |
345 | ||
346 | virtual void AddIndexEntry(std::string* last_key_in_current_block, | |
347 | const Slice* first_key_in_next_block, | |
348 | const BlockHandle& block_handle) override; | |
349 | ||
350 | virtual Status Finish( | |
351 | IndexBlocks* index_blocks, | |
352 | const BlockHandle& last_partition_block_handle) override; | |
353 | ||
11fdf7f2 TL |
354 | virtual size_t IndexSize() const override { return index_size_; } |
355 | size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } | |
356 | size_t NumPartitions() const; | |
7c673cae FG |
357 | |
358 | inline bool ShouldCutFilterBlock() { | |
359 | // Current policy is to align the partitions of index and filters | |
360 | if (cut_filter_block) { | |
361 | cut_filter_block = false; | |
362 | return true; | |
363 | } | |
364 | return false; | |
365 | } | |
366 | ||
367 | std::string& GetPartitionKey() { return sub_index_last_key_; } | |
368 | ||
11fdf7f2 TL |
369 | // Called when an external entity (such as filter partition builder) request |
370 | // cutting the next partition | |
371 | void RequestPartitionCut(); | |
372 | ||
373 | virtual bool seperator_is_key_plus_seq() override { | |
374 | return seperator_is_key_plus_seq_; | |
375 | } | |
376 | ||
377 | bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } | |
378 | ||
7c673cae | 379 | private: |
11fdf7f2 TL |
380 | // Set after ::Finish is called |
381 | size_t top_level_index_size_ = 0; | |
382 | // Set after ::Finish is called | |
383 | size_t partition_cnt_ = 0; | |
384 | ||
7c673cae FG |
385 | void MakeNewSubIndexBuilder(); |
386 | ||
387 | struct Entry { | |
388 | std::string key; | |
389 | std::unique_ptr<ShortenedIndexBuilder> value; | |
390 | }; | |
391 | std::list<Entry> entries_; // list of partitioned indexes and their keys | |
392 | BlockBuilder index_block_builder_; // top-level index builder | |
11fdf7f2 | 393 | BlockBuilder index_block_builder_without_seq_; // same for user keys |
7c673cae FG |
394 | // the active partition index builder |
395 | ShortenedIndexBuilder* sub_index_builder_; | |
396 | // the last key in the active partition index builder | |
397 | std::string sub_index_last_key_; | |
398 | std::unique_ptr<FlushBlockPolicy> flush_policy_; | |
399 | // true if Finish is called once but not complete yet. | |
400 | bool finishing_indexes = false; | |
401 | const BlockBasedTableOptions& table_opt_; | |
11fdf7f2 TL |
402 | bool seperator_is_key_plus_seq_; |
403 | bool use_value_delta_encoding_; | |
404 | // true if an external entity (such as filter partition builder) request | |
405 | // cutting the next partition | |
406 | bool partition_cut_requested_ = true; | |
7c673cae FG |
407 | // true if it should cut the next filter partition block |
408 | bool cut_filter_block = false; | |
11fdf7f2 | 409 | BlockHandle last_encoded_handle_; |
7c673cae FG |
410 | }; |
411 | } // namespace rocksdb |