]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_factory.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_factory.h
CommitLineData
f67539c2 1// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
7c673cae
FG
2// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style license that can be
4// found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
1e59de90
TL
9#include <stdint.h>
10
7c673cae
FG
11#include <memory>
12#include <string>
7c673cae 13
7c673cae
FG
14#include "rocksdb/table.h"
15
f67539c2 16namespace ROCKSDB_NAMESPACE {
7c673cae
FG
17
18struct EnvOptions;
19
7c673cae
FG
20class Status;
21class RandomAccessFile;
22class WritableFile;
23class Table;
24class TableBuilder;
25
f67539c2
TL
26// PlainTableFactory is the entry point to the PlainTable format of
27// SST files. It returns instances of PlainTableBuilder as the builder
28// class and PlainTableReader as the reader class, where the format is
29// actually implemented.
30//
31// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
32// Data is not organized in blocks, which allows fast access. Because of
33// the following downsides:
34// 1. Data compression is not supported.
35// 2. Data is not checksummed.
36// it is not recommended to use this format on other types of file systems.
20effc67 37//
f67539c2 38// PlainTable requires fixed length key, configured as a constructor
7c673cae
FG
39// parameter of the factory class. Output file format:
40// +-------------+-----------------+
41// | version | user_key_length |
42// +------------++------------+-----------------+ <= key1 offset
43// | encoded key1 | value_size | |
44// +------------+-------------+-------------+ |
45// | value1 |
46// | |
47// +--------------------------+-------------+---+ <= key2 offset
48// | encoded key2 | value_size | |
49// +------------+-------------+-------------+ |
50// | value2 |
51// | |
52// | ...... |
53// +-----------------+--------------------------+
54//
55// When the key encoding type is kPlain, the key part is encoded as:
56// +------------+--------------------+
57// | [key_size] | internal key |
58// +------------+--------------------+
59// for the user_key_len = kPlainTableVariableLength case,
60// and simply:
61// +----------------------+
62// | internal key |
63// +----------------------+
64// for user_key_len != kPlainTableVariableLength case.
65//
66// If the key encoding type is kPrefix, keys are encoded in this format.
67// There are three ways to encode a key:
68// (1) Full Key
69// +---------------+---------------+-------------------+
70// | Full Key Flag | Full Key Size | Full Internal Key |
71// +---------------+---------------+-------------------+
72// which simply encodes a full key
73//
74// (2) A key sharing the same prefix as the previous key, where the previous
75// key is encoded in the format of (1).
76// +-------------+-------------+-------------+-------------+------------+
77// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
78// +-------------+-------------+-------------+-------------+------------+
79// where Key Suffix is the suffix part of the key, including the internal
80// bytes. The actual key will be constructed by concatenating the prefix part
81// of the previous key with the suffix part of the key here, with sizes given here.
82//
83// (3) A key sharing the same prefix as the previous key, where the previous
84// key is encoded in the format of (2).
85// +-----------------+-----------------+------------------------+
86// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
87// +-----------------+-----------------+------------------------+
88// The key will be constructed by concatenating the previous key's prefix
89// (which is also the prefix of the last key encoded in the format of (1))
90// and the key suffix given here.
91//
92// For example, the following keys (prefix and suffix are separated by
93// spaces):
94// 0000 0001
95// 0000 00021
96// 0000 0002
97// 00011 00
98// 0002 0001
99// Will be encoded like this:
100// FK 8 00000001
101// PF 4 SF 5 00021
102// SF 4 0002
103// FK 7 0001100
104// FK 8 00020001
105// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
106//
107// All those "key flag + key size" shown above are in this format:
108// The 8 bits of the first byte:
109// +----+----+----+----+----+----+----+----+
110// | Type | Size |
111// +----+----+----+----+----+----+----+----+
112// Type indicates: full key, prefix, or suffix.
113// The last 6 bits are for size. If the size bits are not all 1, it means the
114// size of the key. Otherwise, varint32 is read after this byte. This varint
115// value + 0x3F (the value of all 1) will be the key size.
116//
117// For example, full key with length 16 will be encoded as (binary):
118// 00 010000
119// (00 means full key)
120// and a prefix with 100 bytes will be encoded as:
121// 01 111111 00100101
122// (63) (37)
123// (01 means prefix)
124//
125// All the internal keys above (including kPlain and kPrefix) are encoded in
126// this format:
127// There are two types:
128// (1) normal internal key format
129// +----------- ...... -------------+----+---+---+---+---+---+---+---+
130// | user key |type| sequence ID |
131// +----------- ..... --------------+----+---+---+---+---+---+---+---+
132// (2) Special case for keys whose sequence ID is 0 and is value type
133// +----------- ...... -------------+----+
134// | user key |0xFF|
135// +----------- ..... --------------+----+
136// To save 7 bytes for the special case where sequence ID = 0.
137//
138//
139class PlainTableFactory : public TableFactory {
140 public:
141 ~PlainTableFactory() {}
142 // user_key_len is the length of the user key. If it is set to be
143 // kPlainTableVariableLength, then it means variable length. Otherwise, all
144 // the keys need to have the fixed length of this value. bloom_bits_per_key is
145 // the number of bits used for the bloom filter per key. hash_table_ratio is
146 // the desired utilization of the hash table used for prefix hashing.
147 // hash_table_ratio = number of prefixes / #buckets in the hash table
148 // hash_table_ratio = 0 means skip the hash table and rely only on binary
149 // search.
150 // index_sparseness determines index interval for keys
151 // inside the same prefix. It will be the maximum number of linear search
152 // required after hash and binary search.
153 // index_sparseness = 0 means index for every key.
154 // huge_page_tlb_size determines whether to allocate hash indexes from huge
155 // page TLB and the page size if allocating from there. See comments of
156 // Arena::AllocateAligned() for details.
157 explicit PlainTableFactory(
20effc67
TL
158 const PlainTableOptions& _table_options = PlainTableOptions());
159
160 // Method to allow CheckedCast to work for this class
161 static const char* kClassName() { return kPlainTableName(); }
162 const char* Name() const override { return kPlainTableName(); }
163 using TableFactory::NewTableReader;
164 Status NewTableReader(const ReadOptions& ro,
165 const TableReaderOptions& table_reader_options,
494da23a
TL
166 std::unique_ptr<RandomAccessFileReader>&& file,
167 uint64_t file_size, std::unique_ptr<TableReader>* table,
7c673cae
FG
168 bool prefetch_index_and_filter_in_cache) const override;
169
170 TableBuilder* NewTableBuilder(
171 const TableBuilderOptions& table_builder_options,
1e59de90 172 WritableFileWriter* file) const override;
7c673cae 173
20effc67 174 std::string GetPrintableOptions() const override;
11fdf7f2 175 static const char kValueTypeSeqId0 = char(~0);
7c673cae 176
7c673cae
FG
177 private:
178 PlainTableOptions table_options_;
179};
180
f67539c2 181} // namespace ROCKSDB_NAMESPACE
7c673cae 182#endif // ROCKSDB_LITE