]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_factory.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_factory.h
CommitLineData
f67539c2 1// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
7c673cae
FG
2// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style license that can be
4// found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
9#include <memory>
10#include <string>
11#include <stdint.h>
12
7c673cae
FG
13#include "rocksdb/table.h"
14
f67539c2 15namespace ROCKSDB_NAMESPACE {
7c673cae
FG
16
17struct EnvOptions;
18
7c673cae
FG
19class Status;
20class RandomAccessFile;
21class WritableFile;
22class Table;
23class TableBuilder;
24
f67539c2
TL
25// PlainTableFactory is the entrance function to the PlainTable format of
26// SST files. It returns instances PlainTableBuilder as the builder
27// class and PlainTableReader as the reader class, where the format is
28// actually implemented.
29//
30// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
31// Data is not organized in blocks, which allows fast access. Because of
32// following downsides
33// 1. Data compression is not supported.
34// 2. Data is not checksumed.
35// it is not recommended to use this format on other type of file systems.
20effc67 36//
f67539c2 37// PlainTable requires fixed length key, configured as a constructor
7c673cae
FG
38// parameter of the factory class. Output file format:
39// +-------------+-----------------+
40// | version | user_key_length |
41// +------------++------------+-----------------+ <= key1 offset
42// | encoded key1 | value_size | |
43// +------------+-------------+-------------+ |
44// | value1 |
45// | |
46// +--------------------------+-------------+---+ <= key2 offset
47// | encoded key2 | value_size | |
48// +------------+-------------+-------------+ |
49// | value2 |
50// | |
51// | ...... |
52// +-----------------+--------------------------+
53//
54// When the key encoding type is kPlain. Key part is encoded as:
55// +------------+--------------------+
56// | [key_size] | internal key |
57// +------------+--------------------+
58// for the case of user_key_len = kPlainTableVariableLength case,
59// and simply:
60// +----------------------+
61// | internal key |
62// +----------------------+
63// for user_key_len != kPlainTableVariableLength case.
64//
65// If key encoding type is kPrefix. Keys are encoding in this format.
66// There are three ways to encode a key:
67// (1) Full Key
68// +---------------+---------------+-------------------+
69// | Full Key Flag | Full Key Size | Full Internal Key |
70// +---------------+---------------+-------------------+
71// which simply encodes a full key
72//
73// (2) A key shared the same prefix as the previous key, which is encoded as
74// format of (1).
75// +-------------+-------------+-------------+-------------+------------+
76// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
77// +-------------+-------------+-------------+-------------+------------+
78// where key is the suffix part of the key, including the internal bytes.
79// the actual key will be constructed by concatenating prefix part of the
80// previous key, with the suffix part of the key here, with sizes given here.
81//
82// (3) A key shared the same prefix as the previous key, which is encoded as
83// the format of (2).
84// +-----------------+-----------------+------------------------+
85// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
86// +-----------------+-----------------+------------------------+
87// The key will be constructed by concatenating previous key's prefix (which is
88// also a prefix which the last key encoded in the format of (1)) and the
89// key given here.
90//
91// For example, we for following keys (prefix and suffix are separated by
92// spaces):
93// 0000 0001
94// 0000 00021
95// 0000 0002
96// 00011 00
97// 0002 0001
98// Will be encoded like this:
99// FK 8 00000001
100// PF 4 SF 5 00021
101// SF 4 0002
102// FK 7 0001100
103// FK 8 00020001
104// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
105//
106// All those "key flag + key size" shown above are in this format:
107// The 8 bits of the first byte:
108// +----+----+----+----+----+----+----+----+
109// | Type | Size |
110// +----+----+----+----+----+----+----+----+
111// Type indicates: full key, prefix, or suffix.
112// The last 6 bits are for size. If the size bits are not all 1, it means the
113// size of the key. Otherwise, varint32 is read after this byte. This varint
114// value + 0x3F (the value of all 1) will be the key size.
115//
116// For example, full key with length 16 will be encoded as (binary):
117// 00 010000
118// (00 means full key)
119// and a prefix with 100 bytes will be encoded as:
120// 01 111111 00100101
121// (63) (37)
122// (01 means key suffix)
123//
124// All the internal keys above (including kPlain and kPrefix) are encoded in
125// this format:
126// There are two types:
127// (1) normal internal key format
128// +----------- ...... -------------+----+---+---+---+---+---+---+---+
129// | user key |type| sequence ID |
130// +----------- ..... --------------+----+---+---+---+---+---+---+---+
131// (2) Special case for keys whose sequence ID is 0 and is value type
132// +----------- ...... -------------+----+
133// | user key |0x80|
134// +----------- ..... --------------+----+
135// To save 7 bytes for the special case where sequence ID = 0.
136//
137//
138class PlainTableFactory : public TableFactory {
139 public:
140 ~PlainTableFactory() {}
141 // user_key_len is the length of the user key. If it is set to be
142 // kPlainTableVariableLength, then it means variable length. Otherwise, all
143 // the keys need to have the fix length of this value. bloom_bits_per_key is
144 // number of bits used for bloom filer per key. hash_table_ratio is
145 // the desired utilization of the hash table used for prefix hashing.
146 // hash_table_ratio = number of prefixes / #buckets in the hash table
147 // hash_table_ratio = 0 means skip hash table but only replying on binary
148 // search.
149 // index_sparseness determines index interval for keys
150 // inside the same prefix. It will be the maximum number of linear search
151 // required after hash and binary search.
152 // index_sparseness = 0 means index for every key.
153 // huge_page_tlb_size determines whether to allocate hash indexes from huge
154 // page TLB and the page size if allocating from there. See comments of
155 // Arena::AllocateAligned() for details.
156 explicit PlainTableFactory(
20effc67
TL
157 const PlainTableOptions& _table_options = PlainTableOptions());
158
159 // Method to allow CheckedCast to work for this class
160 static const char* kClassName() { return kPlainTableName(); }
161 const char* Name() const override { return kPlainTableName(); }
162 using TableFactory::NewTableReader;
163 Status NewTableReader(const ReadOptions& ro,
164 const TableReaderOptions& table_reader_options,
494da23a
TL
165 std::unique_ptr<RandomAccessFileReader>&& file,
166 uint64_t file_size, std::unique_ptr<TableReader>* table,
7c673cae
FG
167 bool prefetch_index_and_filter_in_cache) const override;
168
169 TableBuilder* NewTableBuilder(
170 const TableBuilderOptions& table_builder_options,
171 uint32_t column_family_id, WritableFileWriter* file) const override;
172
20effc67 173 std::string GetPrintableOptions() const override;
11fdf7f2 174 static const char kValueTypeSeqId0 = char(~0);
7c673cae 175
7c673cae
FG
176 private:
177 PlainTableOptions table_options_;
178};
179
11fdf7f2 180
f67539c2 181} // namespace ROCKSDB_NAMESPACE
7c673cae 182#endif // ROCKSDB_LITE