]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_factory.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_factory.h
CommitLineData
f67539c2 1// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
7c673cae
FG
2// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style license that can be
4// found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
9#include <memory>
10#include <string>
11#include <stdint.h>
12
11fdf7f2 13#include "options/options_helper.h"
7c673cae
FG
14#include "rocksdb/options.h"
15#include "rocksdb/table.h"
16
f67539c2 17namespace ROCKSDB_NAMESPACE {
7c673cae
FG
18
19struct EnvOptions;
20
7c673cae
FG
21class Status;
22class RandomAccessFile;
23class WritableFile;
24class Table;
25class TableBuilder;
26
f67539c2
TL
27// PlainTableFactory is the entrance function to the PlainTable format of
28// SST files. It returns instances PlainTableBuilder as the builder
29// class and PlainTableReader as the reader class, where the format is
30// actually implemented.
31//
32// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
33// Data is not organized in blocks, which allows fast access. Because of
34// following downsides
35// 1. Data compression is not supported.
36// 2. Data is not checksumed.
37// it is not recommended to use this format on other type of file systems.
38//
39// PlainTable requires fixed length key, configured as a constructor
7c673cae
FG
40// parameter of the factory class. Output file format:
41// +-------------+-----------------+
42// | version | user_key_length |
43// +------------++------------+-----------------+ <= key1 offset
44// | encoded key1 | value_size | |
45// +------------+-------------+-------------+ |
46// | value1 |
47// | |
48// +--------------------------+-------------+---+ <= key2 offset
49// | encoded key2 | value_size | |
50// +------------+-------------+-------------+ |
51// | value2 |
52// | |
53// | ...... |
54// +-----------------+--------------------------+
55//
56// When the key encoding type is kPlain. Key part is encoded as:
57// +------------+--------------------+
58// | [key_size] | internal key |
59// +------------+--------------------+
60// for the case of user_key_len = kPlainTableVariableLength case,
61// and simply:
62// +----------------------+
63// | internal key |
64// +----------------------+
65// for user_key_len != kPlainTableVariableLength case.
66//
67// If key encoding type is kPrefix. Keys are encoding in this format.
68// There are three ways to encode a key:
69// (1) Full Key
70// +---------------+---------------+-------------------+
71// | Full Key Flag | Full Key Size | Full Internal Key |
72// +---------------+---------------+-------------------+
73// which simply encodes a full key
74//
75// (2) A key shared the same prefix as the previous key, which is encoded as
76// format of (1).
77// +-------------+-------------+-------------+-------------+------------+
78// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
79// +-------------+-------------+-------------+-------------+------------+
80// where key is the suffix part of the key, including the internal bytes.
81// the actual key will be constructed by concatenating prefix part of the
82// previous key, with the suffix part of the key here, with sizes given here.
83//
84// (3) A key shared the same prefix as the previous key, which is encoded as
85// the format of (2).
86// +-----------------+-----------------+------------------------+
87// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
88// +-----------------+-----------------+------------------------+
89// The key will be constructed by concatenating previous key's prefix (which is
90// also a prefix which the last key encoded in the format of (1)) and the
91// key given here.
92//
93// For example, we for following keys (prefix and suffix are separated by
94// spaces):
95// 0000 0001
96// 0000 00021
97// 0000 0002
98// 00011 00
99// 0002 0001
100// Will be encoded like this:
101// FK 8 00000001
102// PF 4 SF 5 00021
103// SF 4 0002
104// FK 7 0001100
105// FK 8 00020001
106// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
107//
108// All those "key flag + key size" shown above are in this format:
109// The 8 bits of the first byte:
110// +----+----+----+----+----+----+----+----+
111// | Type | Size |
112// +----+----+----+----+----+----+----+----+
113// Type indicates: full key, prefix, or suffix.
114// The last 6 bits are for size. If the size bits are not all 1, it means the
115// size of the key. Otherwise, varint32 is read after this byte. This varint
116// value + 0x3F (the value of all 1) will be the key size.
117//
118// For example, full key with length 16 will be encoded as (binary):
119// 00 010000
120// (00 means full key)
121// and a prefix with 100 bytes will be encoded as:
122// 01 111111 00100101
123// (63) (37)
124// (01 means key suffix)
125//
126// All the internal keys above (including kPlain and kPrefix) are encoded in
127// this format:
128// There are two types:
129// (1) normal internal key format
130// +----------- ...... -------------+----+---+---+---+---+---+---+---+
131// | user key |type| sequence ID |
132// +----------- ..... --------------+----+---+---+---+---+---+---+---+
133// (2) Special case for keys whose sequence ID is 0 and is value type
134// +----------- ...... -------------+----+
135// | user key |0x80|
136// +----------- ..... --------------+----+
137// To save 7 bytes for the special case where sequence ID = 0.
138//
139//
140class PlainTableFactory : public TableFactory {
141 public:
142 ~PlainTableFactory() {}
143 // user_key_len is the length of the user key. If it is set to be
144 // kPlainTableVariableLength, then it means variable length. Otherwise, all
145 // the keys need to have the fix length of this value. bloom_bits_per_key is
146 // number of bits used for bloom filer per key. hash_table_ratio is
147 // the desired utilization of the hash table used for prefix hashing.
148 // hash_table_ratio = number of prefixes / #buckets in the hash table
149 // hash_table_ratio = 0 means skip hash table but only replying on binary
150 // search.
151 // index_sparseness determines index interval for keys
152 // inside the same prefix. It will be the maximum number of linear search
153 // required after hash and binary search.
154 // index_sparseness = 0 means index for every key.
155 // huge_page_tlb_size determines whether to allocate hash indexes from huge
156 // page TLB and the page size if allocating from there. See comments of
157 // Arena::AllocateAligned() for details.
158 explicit PlainTableFactory(
159 const PlainTableOptions& _table_options = PlainTableOptions())
160 : table_options_(_table_options) {}
161
162 const char* Name() const override { return "PlainTable"; }
163 Status NewTableReader(const TableReaderOptions& table_reader_options,
494da23a
TL
164 std::unique_ptr<RandomAccessFileReader>&& file,
165 uint64_t file_size, std::unique_ptr<TableReader>* table,
7c673cae
FG
166 bool prefetch_index_and_filter_in_cache) const override;
167
168 TableBuilder* NewTableBuilder(
169 const TableBuilderOptions& table_builder_options,
170 uint32_t column_family_id, WritableFileWriter* file) const override;
171
172 std::string GetPrintableTableOptions() const override;
173
174 const PlainTableOptions& table_options() const;
175
11fdf7f2 176 static const char kValueTypeSeqId0 = char(~0);
7c673cae
FG
177
178 // Sanitizes the specified DB Options.
11fdf7f2
TL
179 Status SanitizeOptions(
180 const DBOptions& /*db_opts*/,
181 const ColumnFamilyOptions& /*cf_opts*/) const override {
7c673cae
FG
182 return Status::OK();
183 }
184
185 void* GetOptions() override { return &table_options_; }
186
11fdf7f2
TL
187 Status GetOptionString(std::string* /*opt_string*/,
188 const std::string& /*delimiter*/) const override {
189 return Status::OK();
190 }
191
7c673cae
FG
192 private:
193 PlainTableOptions table_options_;
194};
195
11fdf7f2
TL
196static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
197 {"user_key_len",
198 {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
199 OptionVerificationType::kNormal, false, 0}},
200 {"bloom_bits_per_key",
201 {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
202 OptionVerificationType::kNormal, false, 0}},
203 {"hash_table_ratio",
204 {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
205 OptionVerificationType::kNormal, false, 0}},
206 {"index_sparseness",
207 {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
208 OptionVerificationType::kNormal, false, 0}},
209 {"huge_page_tlb_size",
210 {offsetof(struct PlainTableOptions, huge_page_tlb_size),
211 OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
212 {"encoding_type",
213 {offsetof(struct PlainTableOptions, encoding_type),
214 OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}},
215 {"full_scan_mode",
216 {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
217 OptionVerificationType::kNormal, false, 0}},
218 {"store_index_in_file",
219 {offsetof(struct PlainTableOptions, store_index_in_file),
220 OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
221
f67539c2 222} // namespace ROCKSDB_NAMESPACE
7c673cae 223#endif // ROCKSDB_LITE