]>
Commit | Line | Data |
---|---|---|
f67539c2 | 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
7c673cae FG |
2 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
3 | // Use of this source code is governed by a BSD-style license that can be | |
4 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
5 | ||
6 | #pragma once | |
7 | ||
8 | #ifndef ROCKSDB_LITE | |
9 | #include <memory> | |
10 | #include <string> | |
11 | #include <stdint.h> | |
12 | ||
7c673cae FG |
13 | #include "rocksdb/table.h" |
14 | ||
f67539c2 | 15 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
16 | |
17 | struct EnvOptions; | |
18 | ||
7c673cae FG |
19 | class Status; |
20 | class RandomAccessFile; | |
21 | class WritableFile; | |
22 | class Table; | |
23 | class TableBuilder; | |
24 | ||
f67539c2 TL |
25 | // PlainTableFactory is the entrance function to the PlainTable format of |
26 | // SST files. It returns instances PlainTableBuilder as the builder | |
27 | // class and PlainTableReader as the reader class, where the format is | |
28 | // actually implemented. | |
29 | // | |
30 | // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. | |
31 | // Data is not organized in blocks, which allows fast access. Because of | |
32 | // following downsides | |
33 | // 1. Data compression is not supported. | |
34 | // 2. Data is not checksumed. | |
35 | // it is not recommended to use this format on other type of file systems. | |
20effc67 | 36 | // |
f67539c2 | 37 | // PlainTable requires fixed length key, configured as a constructor |
7c673cae FG |
38 | // parameter of the factory class. Output file format: |
39 | // +-------------+-----------------+ | |
40 | // | version | user_key_length | | |
41 | // +------------++------------+-----------------+ <= key1 offset | |
42 | // | encoded key1 | value_size | | | |
43 | // +------------+-------------+-------------+ | | |
44 | // | value1 | | |
45 | // | | | |
46 | // +--------------------------+-------------+---+ <= key2 offset | |
47 | // | encoded key2 | value_size | | | |
48 | // +------------+-------------+-------------+ | | |
49 | // | value2 | | |
50 | // | | | |
51 | // | ...... | | |
52 | // +-----------------+--------------------------+ | |
53 | // | |
54 | // When the key encoding type is kPlain. Key part is encoded as: | |
55 | // +------------+--------------------+ | |
56 | // | [key_size] | internal key | | |
57 | // +------------+--------------------+ | |
58 | // for the case of user_key_len = kPlainTableVariableLength case, | |
59 | // and simply: | |
60 | // +----------------------+ | |
61 | // | internal key | | |
62 | // +----------------------+ | |
63 | // for user_key_len != kPlainTableVariableLength case. | |
64 | // | |
65 | // If key encoding type is kPrefix. Keys are encoding in this format. | |
66 | // There are three ways to encode a key: | |
67 | // (1) Full Key | |
68 | // +---------------+---------------+-------------------+ | |
69 | // | Full Key Flag | Full Key Size | Full Internal Key | | |
70 | // +---------------+---------------+-------------------+ | |
71 | // which simply encodes a full key | |
72 | // | |
73 | // (2) A key shared the same prefix as the previous key, which is encoded as | |
74 | // format of (1). | |
75 | // +-------------+-------------+-------------+-------------+------------+ | |
76 | // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | | |
77 | // +-------------+-------------+-------------+-------------+------------+ | |
78 | // where key is the suffix part of the key, including the internal bytes. | |
79 | // the actual key will be constructed by concatenating prefix part of the | |
80 | // previous key, with the suffix part of the key here, with sizes given here. | |
81 | // | |
82 | // (3) A key shared the same prefix as the previous key, which is encoded as | |
83 | // the format of (2). | |
84 | // +-----------------+-----------------+------------------------+ | |
85 | // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | | |
86 | // +-----------------+-----------------+------------------------+ | |
87 | // The key will be constructed by concatenating previous key's prefix (which is | |
88 | // also a prefix which the last key encoded in the format of (1)) and the | |
89 | // key given here. | |
90 | // | |
91 | // For example, we for following keys (prefix and suffix are separated by | |
92 | // spaces): | |
93 | // 0000 0001 | |
94 | // 0000 00021 | |
95 | // 0000 0002 | |
96 | // 00011 00 | |
97 | // 0002 0001 | |
98 | // Will be encoded like this: | |
99 | // FK 8 00000001 | |
100 | // PF 4 SF 5 00021 | |
101 | // SF 4 0002 | |
102 | // FK 7 0001100 | |
103 | // FK 8 00020001 | |
104 | // (where FK means full key flag, PF means prefix flag and SF means suffix flag) | |
105 | // | |
106 | // All those "key flag + key size" shown above are in this format: | |
107 | // The 8 bits of the first byte: | |
108 | // +----+----+----+----+----+----+----+----+ | |
109 | // | Type | Size | | |
110 | // +----+----+----+----+----+----+----+----+ | |
111 | // Type indicates: full key, prefix, or suffix. | |
112 | // The last 6 bits are for size. If the size bits are not all 1, it means the | |
113 | // size of the key. Otherwise, varint32 is read after this byte. This varint | |
114 | // value + 0x3F (the value of all 1) will be the key size. | |
115 | // | |
116 | // For example, full key with length 16 will be encoded as (binary): | |
117 | // 00 010000 | |
118 | // (00 means full key) | |
119 | // and a prefix with 100 bytes will be encoded as: | |
120 | // 01 111111 00100101 | |
121 | // (63) (37) | |
122 | // (01 means key suffix) | |
123 | // | |
124 | // All the internal keys above (including kPlain and kPrefix) are encoded in | |
125 | // this format: | |
126 | // There are two types: | |
127 | // (1) normal internal key format | |
128 | // +----------- ...... -------------+----+---+---+---+---+---+---+---+ | |
129 | // | user key |type| sequence ID | | |
130 | // +----------- ..... --------------+----+---+---+---+---+---+---+---+ | |
131 | // (2) Special case for keys whose sequence ID is 0 and is value type | |
132 | // +----------- ...... -------------+----+ | |
133 | // | user key |0x80| | |
134 | // +----------- ..... --------------+----+ | |
135 | // To save 7 bytes for the special case where sequence ID = 0. | |
136 | // | |
137 | // | |
138 | class PlainTableFactory : public TableFactory { | |
139 | public: | |
140 | ~PlainTableFactory() {} | |
141 | // user_key_len is the length of the user key. If it is set to be | |
142 | // kPlainTableVariableLength, then it means variable length. Otherwise, all | |
143 | // the keys need to have the fix length of this value. bloom_bits_per_key is | |
144 | // number of bits used for bloom filer per key. hash_table_ratio is | |
145 | // the desired utilization of the hash table used for prefix hashing. | |
146 | // hash_table_ratio = number of prefixes / #buckets in the hash table | |
147 | // hash_table_ratio = 0 means skip hash table but only replying on binary | |
148 | // search. | |
149 | // index_sparseness determines index interval for keys | |
150 | // inside the same prefix. It will be the maximum number of linear search | |
151 | // required after hash and binary search. | |
152 | // index_sparseness = 0 means index for every key. | |
153 | // huge_page_tlb_size determines whether to allocate hash indexes from huge | |
154 | // page TLB and the page size if allocating from there. See comments of | |
155 | // Arena::AllocateAligned() for details. | |
156 | explicit PlainTableFactory( | |
20effc67 TL |
157 | const PlainTableOptions& _table_options = PlainTableOptions()); |
158 | ||
159 | // Method to allow CheckedCast to work for this class | |
160 | static const char* kClassName() { return kPlainTableName(); } | |
161 | const char* Name() const override { return kPlainTableName(); } | |
162 | using TableFactory::NewTableReader; | |
163 | Status NewTableReader(const ReadOptions& ro, | |
164 | const TableReaderOptions& table_reader_options, | |
494da23a TL |
165 | std::unique_ptr<RandomAccessFileReader>&& file, |
166 | uint64_t file_size, std::unique_ptr<TableReader>* table, | |
7c673cae FG |
167 | bool prefetch_index_and_filter_in_cache) const override; |
168 | ||
169 | TableBuilder* NewTableBuilder( | |
170 | const TableBuilderOptions& table_builder_options, | |
171 | uint32_t column_family_id, WritableFileWriter* file) const override; | |
172 | ||
20effc67 | 173 | std::string GetPrintableOptions() const override; |
11fdf7f2 | 174 | static const char kValueTypeSeqId0 = char(~0); |
7c673cae | 175 | |
7c673cae FG |
176 | private: |
177 | PlainTableOptions table_options_; | |
178 | }; | |
179 | ||
11fdf7f2 | 180 | |
f67539c2 | 181 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 182 | #endif // ROCKSDB_LITE |