1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 // BlockBuilder generates blocks where keys are prefix-compressed:
12 // When we store a key, we drop the prefix shared with the previous
13 // string. This helps reduce the space requirement significantly.
14 // Furthermore, once every K keys, we do not apply the prefix
15 // compression and store the entire key. We call this a "restart
16 // point". The tail end of the block stores the offsets of all of the
17 // restart points, and can be used to do a binary search when looking
18 // for a particular key. Values are stored as-is (without compression)
19 // immediately following the corresponding key.
21 // An entry for a particular key-value pair has the form:
22 // shared_bytes: varint32
23 // unshared_bytes: varint32
24 // value_length: varint32
25 // key_delta: char[unshared_bytes]
26 // value: char[value_length]
27 // shared_bytes == 0 for restart points.
29 // The trailer of the block has the form:
30 // restarts: uint32[num_restarts]
31 // num_restarts: uint32
32 // restarts[i] contains the offset within the block of the ith restart point.
34 #include "table/block_builder.h"
38 #include "db/dbformat.h"
39 #include "rocksdb/comparator.h"
40 #include "table/data_block_footer.h"
41 #include "util/coding.h"
45 BlockBuilder::BlockBuilder(
46 int block_restart_interval
, bool use_delta_encoding
,
47 bool use_value_delta_encoding
,
48 BlockBasedTableOptions::DataBlockIndexType index_type
,
49 double data_block_hash_table_util_ratio
)
50 : block_restart_interval_(block_restart_interval
),
51 use_delta_encoding_(use_delta_encoding
),
52 use_value_delta_encoding_(use_value_delta_encoding
),
57 case BlockBasedTableOptions::kDataBlockBinarySearch
:
59 case BlockBasedTableOptions::kDataBlockBinaryAndHash
:
60 data_block_hash_index_builder_
.Initialize(
61 data_block_hash_table_util_ratio
);
66 assert(block_restart_interval_
>= 1);
67 restarts_
.push_back(0); // First restart point is at offset 0
68 estimate_
= sizeof(uint32_t) + sizeof(uint32_t);
71 void BlockBuilder::Reset() {
74 restarts_
.push_back(0); // First restart point is at offset 0
75 estimate_
= sizeof(uint32_t) + sizeof(uint32_t);
79 if (data_block_hash_index_builder_
.Valid()) {
80 data_block_hash_index_builder_
.Reset();
84 size_t BlockBuilder::EstimateSizeAfterKV(const Slice
& key
,
85 const Slice
& value
) const {
86 size_t estimate
= CurrentSizeEstimate();
87 // Note: this is an imprecise estimate as it accounts for the whole key size
88 // instead of non-shared key size.
89 estimate
+= key
.size();
90 // In value delta encoding we estimate the value delta size as half the full
91 // value size since only the size field of block handle is encoded.
93 !use_value_delta_encoding_
|| (counter_
>= block_restart_interval_
)
97 if (counter_
>= block_restart_interval_
) {
98 estimate
+= sizeof(uint32_t); // a new restart entry.
101 estimate
+= sizeof(int32_t); // varint for shared prefix length.
102 // Note: this is an imprecise estimate as we will have to encoded size, one
103 // for shared key and one for non-shared key.
104 estimate
+= VarintLength(key
.size()); // varint for key length.
105 if (!use_value_delta_encoding_
|| (counter_
>= block_restart_interval_
)) {
106 estimate
+= VarintLength(value
.size()); // varint for value length.
112 Slice
BlockBuilder::Finish() {
113 // Append restart array
114 for (size_t i
= 0; i
< restarts_
.size(); i
++) {
115 PutFixed32(&buffer_
, restarts_
[i
]);
118 uint32_t num_restarts
= static_cast<uint32_t>(restarts_
.size());
119 BlockBasedTableOptions::DataBlockIndexType index_type
=
120 BlockBasedTableOptions::kDataBlockBinarySearch
;
121 if (data_block_hash_index_builder_
.Valid() &&
122 CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex
) {
123 data_block_hash_index_builder_
.Finish(buffer_
);
124 index_type
= BlockBasedTableOptions::kDataBlockBinaryAndHash
;
127 // footer is a packed format of data_block_index_type and num_restarts
128 uint32_t block_footer
= PackIndexTypeAndNumRestarts(index_type
, num_restarts
);
130 PutFixed32(&buffer_
, block_footer
);
132 return Slice(buffer_
);
135 void BlockBuilder::Add(const Slice
& key
, const Slice
& value
,
136 const Slice
* const delta_value
) {
138 assert(counter_
<= block_restart_interval_
);
139 assert(!use_value_delta_encoding_
|| delta_value
);
140 size_t shared
= 0; // number of bytes shared with prev key
141 if (counter_
>= block_restart_interval_
) {
142 // Restart compression
143 restarts_
.push_back(static_cast<uint32_t>(buffer_
.size()));
144 estimate_
+= sizeof(uint32_t);
147 if (use_delta_encoding_
) {
149 last_key_
.assign(key
.data(), key
.size());
151 } else if (use_delta_encoding_
) {
152 Slice
last_key_piece(last_key_
);
153 // See how much sharing to do with previous string
154 shared
= key
.difference_offset(last_key_piece
);
157 // We used to just copy the changed data here, but it appears to be
158 // faster to just copy the whole thing.
159 last_key_
.assign(key
.data(), key
.size());
162 const size_t non_shared
= key
.size() - shared
;
163 const size_t curr_size
= buffer_
.size();
165 if (use_value_delta_encoding_
) {
166 // Add "<shared><non_shared>" to buffer_
167 PutVarint32Varint32(&buffer_
, static_cast<uint32_t>(shared
),
168 static_cast<uint32_t>(non_shared
));
170 // Add "<shared><non_shared><value_size>" to buffer_
171 PutVarint32Varint32Varint32(&buffer_
, static_cast<uint32_t>(shared
),
172 static_cast<uint32_t>(non_shared
),
173 static_cast<uint32_t>(value
.size()));
176 // Add string delta to buffer_ followed by value
177 buffer_
.append(key
.data() + shared
, non_shared
);
178 // Use value delta encoding only when the key has shared bytes. This would
179 // simplify the decoding, where it can figure which decoding to use simply by
180 // looking at the shared bytes size.
181 if (shared
!= 0 && use_value_delta_encoding_
) {
182 buffer_
.append(delta_value
->data(), delta_value
->size());
184 buffer_
.append(value
.data(), value
.size());
187 if (data_block_hash_index_builder_
.Valid()) {
188 data_block_hash_index_builder_
.Add(ExtractUserKey(key
),
189 restarts_
.size() - 1);
193 estimate_
+= buffer_
.size() - curr_size
;
196 } // namespace rocksdb