]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/block_builder.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / table / block_builder.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 //
10 // BlockBuilder generates blocks where keys are prefix-compressed:
11 //
12 // When we store a key, we drop the prefix shared with the previous
13 // string. This helps reduce the space requirement significantly.
14 // Furthermore, once every K keys, we do not apply the prefix
15 // compression and store the entire key. We call this a "restart
16 // point". The tail end of the block stores the offsets of all of the
17 // restart points, and can be used to do a binary search when looking
18 // for a particular key. Values are stored as-is (without compression)
19 // immediately following the corresponding key.
20 //
21 // An entry for a particular key-value pair has the form:
22 // shared_bytes: varint32
23 // unshared_bytes: varint32
24 // value_length: varint32
25 // key_delta: char[unshared_bytes]
26 // value: char[value_length]
27 // shared_bytes == 0 for restart points.
28 //
29 // The trailer of the block has the form:
30 // restarts: uint32[num_restarts]
31 // num_restarts: uint32
32 // restarts[i] contains the offset within the block of the ith restart point.
33
34 #include "table/block_builder.h"
35
36 #include <algorithm>
37 #include <assert.h>
38 #include "rocksdb/comparator.h"
39 #include "db/dbformat.h"
40 #include "util/coding.h"
41
42 namespace rocksdb {
43
44 BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding)
45 : block_restart_interval_(block_restart_interval),
46 use_delta_encoding_(use_delta_encoding),
47 restarts_(),
48 counter_(0),
49 finished_(false) {
50 assert(block_restart_interval_ >= 1);
51 restarts_.push_back(0); // First restart point is at offset 0
52 estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
53 }
54
55 void BlockBuilder::Reset() {
56 buffer_.clear();
57 restarts_.clear();
58 restarts_.push_back(0); // First restart point is at offset 0
59 estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
60 counter_ = 0;
61 finished_ = false;
62 last_key_.clear();
63 }
64
65 size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
66 const {
67 size_t estimate = CurrentSizeEstimate();
68 estimate += key.size() + value.size();
69 if (counter_ >= block_restart_interval_) {
70 estimate += sizeof(uint32_t); // a new restart entry.
71 }
72
73 estimate += sizeof(int32_t); // varint for shared prefix length.
74 estimate += VarintLength(key.size()); // varint for key length.
75 estimate += VarintLength(value.size()); // varint for value length.
76
77 return estimate;
78 }
79
80 Slice BlockBuilder::Finish() {
81 // Append restart array
82 for (size_t i = 0; i < restarts_.size(); i++) {
83 PutFixed32(&buffer_, restarts_[i]);
84 }
85 PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
86 finished_ = true;
87 return Slice(buffer_);
88 }
89
90 void BlockBuilder::Add(const Slice& key, const Slice& value) {
91 assert(!finished_);
92 assert(counter_ <= block_restart_interval_);
93 size_t shared = 0; // number of bytes shared with prev key
94 if (counter_ >= block_restart_interval_) {
95 // Restart compression
96 restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
97 estimate_ += sizeof(uint32_t);
98 counter_ = 0;
99
100 if (use_delta_encoding_) {
101 // Update state
102 last_key_.assign(key.data(), key.size());
103 }
104 } else if (use_delta_encoding_) {
105 Slice last_key_piece(last_key_);
106 // See how much sharing to do with previous string
107 shared = key.difference_offset(last_key_piece);
108
109 // Update state
110 // We used to just copy the changed data here, but it appears to be
111 // faster to just copy the whole thing.
112 last_key_.assign(key.data(), key.size());
113 }
114
115 const size_t non_shared = key.size() - shared;
116 const size_t curr_size = buffer_.size();
117
118 // Add "<shared><non_shared><value_size>" to buffer_
119 PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
120 static_cast<uint32_t>(non_shared),
121 static_cast<uint32_t>(value.size()));
122
123 // Add string delta to buffer_ followed by value
124 buffer_.append(key.data() + shared, non_shared);
125 buffer_.append(value.data(), value.size());
126
127 counter_++;
128 estimate_ += buffer_.size() - curr_size;
129 }
130
131 } // namespace rocksdb