]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | |
6 | #pragma once | |
7 | ||
8 | #ifndef ROCKSDB_LITE | |
9 | ||
10 | #include <string> | |
11 | #include <vector> | |
12 | ||
f67539c2 | 13 | #include "memory/arena.h" |
7c673cae FG |
14 | #include "monitoring/histogram.h" |
15 | #include "options/cf_options.h" | |
16 | #include "rocksdb/options.h" | |
7c673cae | 17 | |
f67539c2 | 18 | namespace ROCKSDB_NAMESPACE { |
7c673cae | 19 | |
f67539c2 TL |
20 | // The file contains two classes PlainTableIndex and PlainTableIndexBuilder |
21 | // The two classes implement the index format of PlainTable. | |
20effc67 | 22 | // For description of PlainTable format, see comments of class |
f67539c2 TL |
23 | // PlainTableFactory |
24 | // | |
25 | // | |
7c673cae FG |
26 | // PlainTableIndex contains buckets size of index_size_, each is a |
27 | // 32-bit integer. The lower 31 bits contain an offset value (explained below) | |
28 | // and the first bit of the integer indicates type of the offset. | |
29 | // | |
30 | // +--------------+------------------------------------------------------+ | |
31 | // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + | |
32 | // +--------------+------------------------------------------------------+ | |
33 | // | |
34 | // Explanation for the "flag bit": | |
35 | // | |
36 | // 0 indicates that the bucket contains only one prefix (no conflict when | |
37 | // hashing this prefix), whose first row starts from this offset of the | |
38 | // file. | |
39 | // 1 indicates that the bucket contains more than one prefixes, or there | |
40 | // are too many rows for one prefix so we need a binary search for it. In | |
41 | // this case, the offset indicates the offset of sub_index_ holding the | |
42 | // binary search indexes of keys for those rows. Those binary search indexes | |
43 | // are organized in this way: | |
44 | // | |
45 | // The first 4 bytes, indicate how many indexes (N) are stored after it. After | |
46 | // it, there are N 32-bit integers, each points of an offset of the file, | |
47 | // which | |
48 | // points to starting of a row. Those offsets need to be guaranteed to be in | |
49 | // ascending order so the keys they are pointing to are also in ascending | |
50 | // order | |
51 | // to make sure we can use them to do binary searches. Below is visual | |
52 | // presentation of a bucket. | |
53 | // | |
54 | // <begin> | |
55 | // number_of_records: varint32 | |
56 | // record 1 file offset: fixedint32 | |
57 | // record 2 file offset: fixedint32 | |
58 | // .... | |
59 | // record N file offset: fixedint32 | |
60 | // <end> | |
f67539c2 TL |
61 | |
62 | // The class loads the index block from a PlainTable SST file, and executes | |
63 | // the index lookup. | |
64 | // The class is used by PlainTableReader class. | |
7c673cae FG |
65 | class PlainTableIndex { |
66 | public: | |
67 | enum IndexSearchResult { | |
68 | kNoPrefixForBucket = 0, | |
69 | kDirectToFile = 1, | |
70 | kSubindex = 2 | |
71 | }; | |
72 | ||
73 | explicit PlainTableIndex(Slice data) { InitFromRawData(data); } | |
74 | ||
75 | PlainTableIndex() | |
76 | : index_size_(0), | |
77 | sub_index_size_(0), | |
78 | num_prefixes_(0), | |
79 | index_(nullptr), | |
80 | sub_index_(nullptr) {} | |
81 | ||
f67539c2 TL |
82 | // The function that executes the lookup the hash table. |
83 | // The hash key is `prefix_hash`. The function fills the hash bucket | |
84 | // content in `bucket_value`, which is up to the caller to interpret. | |
7c673cae FG |
85 | IndexSearchResult GetOffset(uint32_t prefix_hash, |
86 | uint32_t* bucket_value) const; | |
87 | ||
f67539c2 TL |
88 | // Initialize data from `index_data`, which points to raw data for |
89 | // index stored in the SST file. | |
90 | Status InitFromRawData(Slice index_data); | |
7c673cae | 91 | |
f67539c2 TL |
92 | // Decode the sub index for specific hash bucket. |
93 | // The `offset` is the value returned as `bucket_value` by GetOffset() | |
94 | // and is only valid when the return value is `kSubindex`. | |
95 | // The return value is the pointer to the starting address of the | |
96 | // sub-index. `upper_bound` is filled with the value indicating how many | |
97 | // entries the sub-index has. | |
7c673cae FG |
98 | const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, |
99 | uint32_t* upper_bound) const { | |
100 | const char* index_ptr = &sub_index_[offset]; | |
101 | return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); | |
102 | } | |
103 | ||
104 | uint32_t GetIndexSize() const { return index_size_; } | |
105 | ||
106 | uint32_t GetSubIndexSize() const { return sub_index_size_; } | |
107 | ||
108 | uint32_t GetNumPrefixes() const { return num_prefixes_; } | |
109 | ||
110 | static const uint64_t kMaxFileSize = (1u << 31) - 1; | |
111 | static const uint32_t kSubIndexMask = 0x80000000; | |
112 | static const size_t kOffsetLen = sizeof(uint32_t); | |
113 | ||
114 | private: | |
115 | uint32_t index_size_; | |
116 | uint32_t sub_index_size_; | |
117 | uint32_t num_prefixes_; | |
118 | ||
119 | uint32_t* index_; | |
120 | char* sub_index_; | |
121 | }; | |
122 | ||
123 | // PlainTableIndexBuilder is used to create plain table index. | |
124 | // After calling Finish(), it returns Slice, which is usually | |
125 | // used either to initialize PlainTableIndex or | |
126 | // to save index to sst file. | |
f67539c2 | 127 | // For more details about the index, please refer to: |
7c673cae FG |
128 | // https://github.com/facebook/rocksdb/wiki/PlainTable-Format |
129 | // #wiki-in-memory-index-format | |
f67539c2 | 130 | // The class is used by PlainTableBuilder class. |
7c673cae FG |
131 | class PlainTableIndexBuilder { |
132 | public: | |
1e59de90 | 133 | PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, |
11fdf7f2 | 134 | const SliceTransform* prefix_extractor, |
7c673cae FG |
135 | size_t index_sparseness, double hash_table_ratio, |
136 | size_t huge_page_tlb_size) | |
137 | : arena_(arena), | |
138 | ioptions_(ioptions), | |
139 | record_list_(kRecordsPerGroup), | |
140 | is_first_record_(true), | |
141 | due_index_(false), | |
142 | num_prefixes_(0), | |
143 | num_keys_per_prefix_(0), | |
144 | prev_key_prefix_hash_(0), | |
145 | index_sparseness_(index_sparseness), | |
11fdf7f2 TL |
146 | index_size_(0), |
147 | sub_index_size_(0), | |
148 | prefix_extractor_(prefix_extractor), | |
7c673cae FG |
149 | hash_table_ratio_(hash_table_ratio), |
150 | huge_page_tlb_size_(huge_page_tlb_size) {} | |
151 | ||
152 | void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); | |
153 | ||
154 | Slice Finish(); | |
155 | ||
156 | uint32_t GetTotalSize() const { | |
157 | return VarintLength(index_size_) + VarintLength(num_prefixes_) + | |
158 | PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; | |
159 | } | |
160 | ||
161 | static const std::string kPlainTableIndexBlock; | |
162 | ||
163 | private: | |
164 | struct IndexRecord { | |
165 | uint32_t hash; // hash of the prefix | |
166 | uint32_t offset; // offset of a row | |
167 | IndexRecord* next; | |
168 | }; | |
169 | ||
170 | // Helper class to track all the index records | |
171 | class IndexRecordList { | |
172 | public: | |
173 | explicit IndexRecordList(size_t num_records_per_group) | |
174 | : kNumRecordsPerGroup(num_records_per_group), | |
175 | current_group_(nullptr), | |
176 | num_records_in_current_group_(num_records_per_group) {} | |
177 | ||
178 | ~IndexRecordList() { | |
179 | for (size_t i = 0; i < groups_.size(); i++) { | |
180 | delete[] groups_[i]; | |
181 | } | |
182 | } | |
183 | ||
184 | void AddRecord(uint32_t hash, uint32_t offset); | |
185 | ||
186 | size_t GetNumRecords() const { | |
187 | return (groups_.size() - 1) * kNumRecordsPerGroup + | |
188 | num_records_in_current_group_; | |
189 | } | |
190 | IndexRecord* At(size_t index) { | |
1e59de90 TL |
191 | return &( |
192 | groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); | |
7c673cae FG |
193 | } |
194 | ||
195 | private: | |
196 | IndexRecord* AllocateNewGroup() { | |
197 | IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; | |
198 | groups_.push_back(result); | |
199 | return result; | |
200 | } | |
201 | ||
202 | // Each group in `groups_` contains fix-sized records (determined by | |
203 | // kNumRecordsPerGroup). Which can help us minimize the cost if resizing | |
204 | // occurs. | |
205 | const size_t kNumRecordsPerGroup; | |
206 | IndexRecord* current_group_; | |
207 | // List of arrays allocated | |
208 | std::vector<IndexRecord*> groups_; | |
209 | size_t num_records_in_current_group_; | |
210 | }; | |
211 | ||
212 | void AllocateIndex(); | |
213 | ||
214 | // Internal helper function to bucket index record list to hash buckets. | |
215 | void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, | |
216 | std::vector<uint32_t>* entries_per_bucket); | |
217 | ||
218 | // Internal helper class to fill the indexes and bloom filters to internal | |
219 | // data structures. | |
220 | Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, | |
221 | const std::vector<uint32_t>& entries_per_bucket); | |
222 | ||
223 | Arena* arena_; | |
1e59de90 | 224 | const ImmutableOptions ioptions_; |
7c673cae FG |
225 | HistogramImpl keys_per_prefix_hist_; |
226 | IndexRecordList record_list_; | |
227 | bool is_first_record_; | |
228 | bool due_index_; | |
229 | uint32_t num_prefixes_; | |
230 | uint32_t num_keys_per_prefix_; | |
231 | ||
232 | uint32_t prev_key_prefix_hash_; | |
233 | size_t index_sparseness_; | |
234 | uint32_t index_size_; | |
235 | uint32_t sub_index_size_; | |
236 | ||
237 | const SliceTransform* prefix_extractor_; | |
238 | double hash_table_ratio_; | |
239 | size_t huge_page_tlb_size_; | |
240 | ||
241 | std::string prev_key_prefix_; | |
242 | ||
243 | static const size_t kRecordsPerGroup = 256; | |
244 | }; | |
245 | ||
1e59de90 | 246 | } // namespace ROCKSDB_NAMESPACE |
7c673cae FG |
247 | |
248 | #endif // ROCKSDB_LITE |