1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
11 #include <unordered_map>
13 #include "util/coding.h"
// Selects the column-aware encoding applied to a column buffer.
// NOTE(review): the enumerator list is not visible in this excerpt. This file
// refers to kColNoCompression, kColRle, kColRleVarint, kColRleDeltaVarint and
// kColRleDict; confirm the full list against the complete header.
17 enum ColCompressionType
{
// Forward declaration. ColDeclaration is defined later in this file and is
// needed earlier by ColBufEncoder::NewColBufEncoder().
28 struct ColDeclaration
;
30 // ColBufEncoder is a class to encode column buffers. It can be created from a
31 // ColDeclaration. Each call to Append() takes one column value, encodes it
32 // and stores it in an internal buffer. After all rows for this column are
33 // consumed, Finish() should be called to add the header and remaining data.
37 // Read a column, encode data and append into internal buffer.
// `buf` points to the raw column value to encode.
// NOTE(review): the meaning of the returned size_t is not stated here —
// presumably the number of input bytes consumed; confirm against the
// implementations.
38 virtual size_t Append(const char *buf
) = 0;
// Pure virtual destructor: keeps ColBufEncoder abstract. C++ still requires a
// definition to exist (not visible in this excerpt), because the destructors
// of derived classes invoke it.
39 virtual ~ColBufEncoder() = 0;
40 // Get the internal column buffer. Should only be called after Finish().
// Returns a const reference rather than a copy.
// NOTE(review): presumably the reference points at encoder-owned storage and
// is only valid while the encoder is alive — confirm.
41 const std::string
&GetData();
42 // Finish encoding. Add header and remaining data.
// Pure virtual: every concrete encoder must provide its own implementation.
43 virtual void Finish() = 0;
44 // Populate a ColBufEncoder from ColDeclaration.
45 static ColBufEncoder
*NewColBufEncoder(const ColDeclaration
&col_declaration
);
49 static inline bool IsRunLength(ColCompressionType type
) {
50 return type
== kColRle
|| type
== kColRleVarint
||
51 type
== kColRleDeltaVarint
|| type
== kColRleDict
;
55 // Encoder for fixed length column buffer. In fixed length column buffer, the
56 // size of the column should not exceed 8 bytes.
57 // The following encodings are supported:
58 // Varint: Variable length integer. See util/coding.h for more details.
59 // Rle (Run length encoding): encode a sequence of contiguous values as
60 // [run_value][run_length]. Can be combined with Varint.
61 // Delta: Encode each value as its delta from the adjacent entry. Use varint
62 // to possibly reduce stored bytes. Can be combined with Rle.
63 // Dictionary: Use a dictionary to record all possible values in the block and
64 // encode them with an ID starting from 0. IDs are encoded as varint. A column
65 // with dictionary encoding will have a header to store all actual values,
66 // ordered by their dictionary value, and the data will be replaced by
67 // dictionary value. Can be combined with Rle.
68 class FixedLengthColBufEncoder
: public ColBufEncoder
{
// Creates an encoder for a column that is `size` bytes wide. Unless overridden
// by the caller, no compression is applied (kColNoCompression), values are not
// nullable, and the input is not treated as big endian.
70 explicit FixedLengthColBufEncoder(
71 size_t size
, ColCompressionType col_compression_type
= kColNoCompression
,
72 bool nullable
= false, bool big_endian
= false)
74 col_compression_type_(col_compression_type
),
76 big_endian_(big_endian
),
// Encodes one column value; overrides ColBufEncoder::Append().
81 size_t Append(const char *buf
) override
;
// Completes the encoding; overrides ColBufEncoder::Finish().
82 void Finish() override
;
// Empty body: this destructor performs no explicit cleanup of its own.
83 ~FixedLengthColBufEncoder() {}
// Encoding scheme selected at construction time.
87 ColCompressionType col_compression_type_
;
88 // If set as true, the input value can be null (represented as nullptr). When
89 // nullable is true, use one more byte before actual value to indicate if the
90 // current value is null.
92 // If set as true, input value will be treated as big endian encoded.
99 // Map to store the dictionary for dictionary encoding.
100 std::unordered_map
<uint64_t, uint64_t> dictionary_
;
101 // Vector of dictionary keys.
102 std::vector
<uint64_t> dict_vec_
;
105 // Long fixed length column buffer is a variant of the fixed length buffer that
106 // holds fixed length columns of more than 8 bytes. We do not support any
107 // special encoding schemes in LongFixedLengthColBufEncoder.
108 class LongFixedLengthColBufEncoder
: public ColBufEncoder
{
// Stores the column width (`size`) and whether values may be null
// (`nullable`); both arguments are required.
110 LongFixedLengthColBufEncoder(size_t size
, bool nullable
)
111 : size_(size
), nullable_(nullable
) {}
// Encodes one column value; overrides ColBufEncoder::Append().
112 size_t Append(const char *buf
) override
;
// Completes the encoding; overrides ColBufEncoder::Finish().
113 void Finish() override
;
// Empty body: this destructor performs no explicit cleanup of its own.
115 ~LongFixedLengthColBufEncoder() {}
122 // Variable length column buffer holds a format of variable length column. In
123 // this format, a column is composed of a one byte length k, followed by k
124 // bytes of data.
125 class VariableLengthColBufEncoder
: public ColBufEncoder
{
// Encodes one column value; overrides ColBufEncoder::Append().
127 size_t Append(const char *buf
) override
;
// Completes the encoding; overrides ColBufEncoder::Finish().
128 void Finish() override
;
// Empty body: this destructor performs no explicit cleanup of its own.
130 ~VariableLengthColBufEncoder() {}
133 // Variable chunk column buffer holds another format of variable length column.
134 // In this format, a column contains multiple chunks of data, each of which is
135 // composed of 8 bytes long data, and one byte as a mask to indicate whether we
136 // have more data to come. If no more data coming, the mask is set as 0xFF. If
137 // the chunk is the last chunk and has only k valid bytes, the mask is set as
// NOTE(review): the sentence above is cut off in this excerpt; confirm the mask
// value used for a partial last chunk against the encoder implementation.
139 class VariableChunkColBufEncoder
: public VariableLengthColBufEncoder
{
// Encodes one column value; overrides the inherited Append().
141 size_t Append(const char *buf
) override
;
// Completes the encoding; overrides the inherited Finish().
142 void Finish() override
;
// Creates an encoder that uses the given compression type.
143 explicit VariableChunkColBufEncoder(ColCompressionType col_compression_type
)
144 : col_compression_type_(col_compression_type
) {}
// Default constructor: no compression (kColNoCompression).
145 VariableChunkColBufEncoder() : col_compression_type_(kColNoCompression
) {}
// Encoding scheme selected at construction time.
148 ColCompressionType col_compression_type_
;
149 // Map to store the dictionary for dictionary encoding.
150 std::unordered_map
<uint64_t, uint64_t> dictionary_
;
151 // Vector of dictionary keys.
152 std::vector
<uint64_t> dict_vec_
;
155 // ColDeclaration declares a column's type, algorithm of column-aware encoding,
156 // and other column data like endianness and nullability.
157 struct ColDeclaration
{
// Only the column type is mandatory. The remaining arguments default to
// kColNoCompression, a size of 0, not nullable, and not big endian.
// NOTE(review): _col_type is taken by value and then copied into col_type;
// moving it would avoid one std::string copy.
158 explicit ColDeclaration(
159 std::string _col_type
,
160 ColCompressionType _col_compression_type
= kColNoCompression
,
161 size_t _size
= 0, bool _nullable
= false, bool _big_endian
= false)
162 : col_type(_col_type
),
163 col_compression_type(_col_compression_type
),
166 big_endian(_big_endian
) {}
// Column type, stored as a string.
167 std::string col_type
;
// Column-aware encoding algorithm to use for this column.
168 ColCompressionType col_compression_type
;
174 // KVPairColDeclarations is a class to hold column declaration of columns in
176 struct KVPairColDeclarations
{
177 std::vector
<ColDeclaration
> *key_col_declarations
;
178 std::vector
<ColDeclaration
> *value_col_declarations
;
179 ColDeclaration
*value_checksum_declaration
;
180 KVPairColDeclarations(std::vector
<ColDeclaration
> *_key_col_declarations
,
181 std::vector
<ColDeclaration
> *_value_col_declarations
,
182 ColDeclaration
*_value_checksum_declaration
)
183 : key_col_declarations(_key_col_declarations
),
184 value_col_declarations(_value_col_declarations
),
185 value_checksum_declaration(_value_checksum_declaration
) {}
188 // Similar to KVPairColDeclarations, KVPairColBufEncoders is used to hold
189 // column buffer encoders of all columns in key and value.
190 struct KVPairColBufEncoders
{
// One encoder per key column; each encoder is owned through its unique_ptr.
191 std::vector
<std::unique_ptr
<ColBufEncoder
>> key_col_bufs
;
// One encoder per value column; each encoder is owned through its unique_ptr.
192 std::vector
<std::unique_ptr
<ColBufEncoder
>> value_col_bufs
;
// Encoder for the value checksum column.
193 std::unique_ptr
<ColBufEncoder
> value_checksum_buf
;
195 explicit KVPairColBufEncoders(const KVPairColDeclarations
&kvp_cd
) {
196 for (auto kcd
: *kvp_cd
.key_col_declarations
) {
197 key_col_bufs
.emplace_back(
198 std::move(ColBufEncoder::NewColBufEncoder(kcd
)));
200 for (auto vcd
: *kvp_cd
.value_col_declarations
) {
201 value_col_bufs
.emplace_back(
202 std::move(ColBufEncoder::NewColBufEncoder(vcd
)));
204 value_checksum_buf
.reset(
205 ColBufEncoder::NewColBufEncoder(*kvp_cd
.value_checksum_declaration
));
208 // Helper function to call Finish() on the encoders held by this struct.
// Iterates over key_col_bufs, then value_col_bufs, and finally finishes
// value_checksum_buf.
// NOTE(review): the loop bodies are not visible in this excerpt — presumably
// each one calls col_buf->Finish(); confirm.
210 for (auto &col_buf
: key_col_bufs
) {
213 for (auto &col_buf
: value_col_bufs
) {
216 value_checksum_buf
->Finish();
219 } // namespace rocksdb