1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
13 #include "options/options_helper.h"
14 #include "rocksdb/options.h"
15 #include "rocksdb/table.h"
17 namespace ROCKSDB_NAMESPACE
{
22 class RandomAccessFile
;
27 // PlainTableFactory is the entrance function to the PlainTable format of
28 // SST files. It returns instances PlainTableBuilder as the builder
29 // class and PlainTableReader as the reader class, where the format is
30 // actually implemented.
32 // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
33 // Data is not organized in blocks, which allows fast access. Because of
34 // following downsides
35 // 1. Data compression is not supported.
36 // 2. Data is not checksumed.
37 // it is not recommended to use this format on other type of file systems.
39 // PlainTable requires fixed length key, configured as a constructor
40 // parameter of the factory class. Output file format:
41 // +-------------+-----------------+
42 // | version | user_key_length |
43 // +------------++------------+-----------------+ <= key1 offset
44 // | encoded key1 | value_size | |
45 // +------------+-------------+-------------+ |
48 // +--------------------------+-------------+---+ <= key2 offset
49 // | encoded key2 | value_size | |
50 // +------------+-------------+-------------+ |
54 // +-----------------+--------------------------+
56 // When the key encoding type is kPlain. Key part is encoded as:
57 // +------------+--------------------+
58 // | [key_size] | internal key |
59 // +------------+--------------------+
60 // for the case of user_key_len = kPlainTableVariableLength case,
62 // +----------------------+
64 // +----------------------+
65 // for user_key_len != kPlainTableVariableLength case.
67 // If key encoding type is kPrefix. Keys are encoding in this format.
68 // There are three ways to encode a key:
70 // +---------------+---------------+-------------------+
71 // | Full Key Flag | Full Key Size | Full Internal Key |
72 // +---------------+---------------+-------------------+
73 // which simply encodes a full key
75 // (2) A key shared the same prefix as the previous key, which is encoded as
77 // +-------------+-------------+-------------+-------------+------------+
78 // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
79 // +-------------+-------------+-------------+-------------+------------+
80 // where key is the suffix part of the key, including the internal bytes.
81 // the actual key will be constructed by concatenating prefix part of the
82 // previous key, with the suffix part of the key here, with sizes given here.
84 // (3) A key shared the same prefix as the previous key, which is encoded as
86 // +-----------------+-----------------+------------------------+
87 // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
88 // +-----------------+-----------------+------------------------+
89 // The key will be constructed by concatenating previous key's prefix (which is
90 // also a prefix which the last key encoded in the format of (1)) and the
93 // For example, we for following keys (prefix and suffix are separated by
100 // Will be encoded like this:
106 // (where FK means full key flag, PF means prefix flag and SF means suffix flag)
108 // All those "key flag + key size" shown above are in this format:
109 // The 8 bits of the first byte:
110 // +----+----+----+----+----+----+----+----+
112 // +----+----+----+----+----+----+----+----+
113 // Type indicates: full key, prefix, or suffix.
114 // The last 6 bits are for size. If the size bits are not all 1, it means the
115 // size of the key. Otherwise, varint32 is read after this byte. This varint
116 // value + 0x3F (the value of all 1) will be the key size.
118 // For example, full key with length 16 will be encoded as (binary):
120 // (00 means full key)
121 // and a prefix with 100 bytes will be encoded as:
122 // 01 111111 00100101
124 // (01 means key suffix)
126 // All the internal keys above (including kPlain and kPrefix) are encoded in
128 // There are two types:
129 // (1) normal internal key format
130 // +----------- ...... -------------+----+---+---+---+---+---+---+---+
131 // | user key |type| sequence ID |
132 // +----------- ..... --------------+----+---+---+---+---+---+---+---+
133 // (2) Special case for keys whose sequence ID is 0 and is value type
134 // +----------- ...... -------------+----+
136 // +----------- ..... --------------+----+
137 // To save 7 bytes for the special case where sequence ID = 0.
140 class PlainTableFactory
: public TableFactory
{
142 ~PlainTableFactory() {}
143 // user_key_len is the length of the user key. If it is set to be
144 // kPlainTableVariableLength, then it means variable length. Otherwise, all
145 // the keys need to have the fix length of this value. bloom_bits_per_key is
146 // number of bits used for bloom filer per key. hash_table_ratio is
147 // the desired utilization of the hash table used for prefix hashing.
148 // hash_table_ratio = number of prefixes / #buckets in the hash table
149 // hash_table_ratio = 0 means skip hash table but only replying on binary
151 // index_sparseness determines index interval for keys
152 // inside the same prefix. It will be the maximum number of linear search
153 // required after hash and binary search.
154 // index_sparseness = 0 means index for every key.
155 // huge_page_tlb_size determines whether to allocate hash indexes from huge
156 // page TLB and the page size if allocating from there. See comments of
157 // Arena::AllocateAligned() for details.
158 explicit PlainTableFactory(
159 const PlainTableOptions
& _table_options
= PlainTableOptions())
160 : table_options_(_table_options
) {}
162 const char* Name() const override
{ return "PlainTable"; }
163 Status
NewTableReader(const TableReaderOptions
& table_reader_options
,
164 std::unique_ptr
<RandomAccessFileReader
>&& file
,
165 uint64_t file_size
, std::unique_ptr
<TableReader
>* table
,
166 bool prefetch_index_and_filter_in_cache
) const override
;
168 TableBuilder
* NewTableBuilder(
169 const TableBuilderOptions
& table_builder_options
,
170 uint32_t column_family_id
, WritableFileWriter
* file
) const override
;
172 std::string
GetPrintableTableOptions() const override
;
174 const PlainTableOptions
& table_options() const;
176 static const char kValueTypeSeqId0
= char(~0);
178 // Sanitizes the specified DB Options.
179 Status
SanitizeOptions(
180 const DBOptions
& /*db_opts*/,
181 const ColumnFamilyOptions
& /*cf_opts*/) const override
{
185 void* GetOptions() override
{ return &table_options_
; }
187 Status
GetOptionString(std::string
* /*opt_string*/,
188 const std::string
& /*delimiter*/) const override
{
193 PlainTableOptions table_options_
;
196 static std::unordered_map
<std::string
, OptionTypeInfo
> plain_table_type_info
= {
198 {offsetof(struct PlainTableOptions
, user_key_len
), OptionType::kUInt32T
,
199 OptionVerificationType::kNormal
, false, 0}},
200 {"bloom_bits_per_key",
201 {offsetof(struct PlainTableOptions
, bloom_bits_per_key
), OptionType::kInt
,
202 OptionVerificationType::kNormal
, false, 0}},
204 {offsetof(struct PlainTableOptions
, hash_table_ratio
), OptionType::kDouble
,
205 OptionVerificationType::kNormal
, false, 0}},
207 {offsetof(struct PlainTableOptions
, index_sparseness
), OptionType::kSizeT
,
208 OptionVerificationType::kNormal
, false, 0}},
209 {"huge_page_tlb_size",
210 {offsetof(struct PlainTableOptions
, huge_page_tlb_size
),
211 OptionType::kSizeT
, OptionVerificationType::kNormal
, false, 0}},
213 {offsetof(struct PlainTableOptions
, encoding_type
),
214 OptionType::kEncodingType
, OptionVerificationType::kByName
, false, 0}},
216 {offsetof(struct PlainTableOptions
, full_scan_mode
), OptionType::kBoolean
,
217 OptionVerificationType::kNormal
, false, 0}},
218 {"store_index_in_file",
219 {offsetof(struct PlainTableOptions
, store_index_in_file
),
220 OptionType::kBoolean
, OptionVerificationType::kNormal
, false, 0}}};
222 } // namespace ROCKSDB_NAMESPACE
223 #endif // ROCKSDB_LITE