1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
13 #include "memory/arena.h"
14 #include "monitoring/histogram.h"
15 #include "options/cf_options.h"
16 #include "rocksdb/options.h"
18 namespace ROCKSDB_NAMESPACE
{
20 // The file contains two classes PlainTableIndex and PlainTableIndexBuilder
21 // The two classes implement the index format of PlainTable.
22 // For description of PlainTable format, see comments of class
26 // PlainTableIndex contains buckets size of index_size_, each is a
27 // 32-bit integer. The lower 31 bits contain an offset value (explained below)
28 // and the first bit of the integer indicates type of the offset.
30 // +--------------+------------------------------------------------------+
31 // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
32 // +--------------+------------------------------------------------------+
34 // Explanation for the "flag bit":
36 // 0 indicates that the bucket contains only one prefix (no conflict when
37 // hashing this prefix), whose first row starts from this offset of the
39 // 1 indicates that the bucket contains more than one prefixes, or there
40 // are too many rows for one prefix so we need a binary search for it. In
41 // this case, the offset indicates the offset of sub_index_ holding the
42 // binary search indexes of keys for those rows. Those binary search indexes
43 // are organized in this way:
45 // The first 4 bytes, indicate how many indexes (N) are stored after it. After
46 // it, there are N 32-bit integers, each points of an offset of the file,
48 // points to starting of a row. Those offsets need to be guaranteed to be in
49 // ascending order so the keys they are pointing to are also in ascending
51 // to make sure we can use them to do binary searches. Below is visual
52 // presentation of a bucket.
55 // number_of_records: varint32
56 // record 1 file offset: fixedint32
57 // record 2 file offset: fixedint32
59 // record N file offset: fixedint32
62 // The class loads the index block from a PlainTable SST file, and executes
64 // The class is used by PlainTableReader class.
65 class PlainTableIndex
{
67 enum IndexSearchResult
{
68 kNoPrefixForBucket
= 0,
73 explicit PlainTableIndex(Slice data
) { InitFromRawData(data
); }
80 sub_index_(nullptr) {}
82 // The function that executes the lookup the hash table.
83 // The hash key is `prefix_hash`. The function fills the hash bucket
84 // content in `bucket_value`, which is up to the caller to interpret.
85 IndexSearchResult
GetOffset(uint32_t prefix_hash
,
86 uint32_t* bucket_value
) const;
88 // Initialize data from `index_data`, which points to raw data for
89 // index stored in the SST file.
90 Status
InitFromRawData(Slice index_data
);
92 // Decode the sub index for specific hash bucket.
93 // The `offset` is the value returned as `bucket_value` by GetOffset()
94 // and is only valid when the return value is `kSubindex`.
95 // The return value is the pointer to the starting address of the
96 // sub-index. `upper_bound` is filled with the value indicating how many
97 // entries the sub-index has.
98 const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset
,
99 uint32_t* upper_bound
) const {
100 const char* index_ptr
= &sub_index_
[offset
];
101 return GetVarint32Ptr(index_ptr
, index_ptr
+ 4, upper_bound
);
104 uint32_t GetIndexSize() const { return index_size_
; }
106 uint32_t GetSubIndexSize() const { return sub_index_size_
; }
108 uint32_t GetNumPrefixes() const { return num_prefixes_
; }
110 static const uint64_t kMaxFileSize
= (1u << 31) - 1;
111 static const uint32_t kSubIndexMask
= 0x80000000;
112 static const size_t kOffsetLen
= sizeof(uint32_t);
115 uint32_t index_size_
;
116 uint32_t sub_index_size_
;
117 uint32_t num_prefixes_
;
123 // PlainTableIndexBuilder is used to create plain table index.
124 // After calling Finish(), it returns Slice, which is usually
125 // used either to initialize PlainTableIndex or
126 // to save index to sst file.
127 // For more details about the index, please refer to:
128 // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
129 // #wiki-in-memory-index-format
130 // The class is used by PlainTableBuilder class.
131 class PlainTableIndexBuilder
{
133 PlainTableIndexBuilder(Arena
* arena
, const ImmutableOptions
& ioptions
,
134 const SliceTransform
* prefix_extractor
,
135 size_t index_sparseness
, double hash_table_ratio
,
136 size_t huge_page_tlb_size
)
139 record_list_(kRecordsPerGroup
),
140 is_first_record_(true),
143 num_keys_per_prefix_(0),
144 prev_key_prefix_hash_(0),
145 index_sparseness_(index_sparseness
),
148 prefix_extractor_(prefix_extractor
),
149 hash_table_ratio_(hash_table_ratio
),
150 huge_page_tlb_size_(huge_page_tlb_size
) {}
152 void AddKeyPrefix(Slice key_prefix_slice
, uint32_t key_offset
);
156 uint32_t GetTotalSize() const {
157 return VarintLength(index_size_
) + VarintLength(num_prefixes_
) +
158 PlainTableIndex::kOffsetLen
* index_size_
+ sub_index_size_
;
161 static const std::string kPlainTableIndexBlock
;
165 uint32_t hash
; // hash of the prefix
166 uint32_t offset
; // offset of a row
170 // Helper class to track all the index records
171 class IndexRecordList
{
173 explicit IndexRecordList(size_t num_records_per_group
)
174 : kNumRecordsPerGroup(num_records_per_group
),
175 current_group_(nullptr),
176 num_records_in_current_group_(num_records_per_group
) {}
179 for (size_t i
= 0; i
< groups_
.size(); i
++) {
184 void AddRecord(uint32_t hash
, uint32_t offset
);
186 size_t GetNumRecords() const {
187 return (groups_
.size() - 1) * kNumRecordsPerGroup
+
188 num_records_in_current_group_
;
190 IndexRecord
* At(size_t index
) {
192 groups_
[index
/ kNumRecordsPerGroup
][index
% kNumRecordsPerGroup
]);
196 IndexRecord
* AllocateNewGroup() {
197 IndexRecord
* result
= new IndexRecord
[kNumRecordsPerGroup
];
198 groups_
.push_back(result
);
202 // Each group in `groups_` contains fix-sized records (determined by
203 // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
205 const size_t kNumRecordsPerGroup
;
206 IndexRecord
* current_group_
;
207 // List of arrays allocated
208 std::vector
<IndexRecord
*> groups_
;
209 size_t num_records_in_current_group_
;
212 void AllocateIndex();
214 // Internal helper function to bucket index record list to hash buckets.
215 void BucketizeIndexes(std::vector
<IndexRecord
*>* hash_to_offsets
,
216 std::vector
<uint32_t>* entries_per_bucket
);
218 // Internal helper class to fill the indexes and bloom filters to internal
220 Slice
FillIndexes(const std::vector
<IndexRecord
*>& hash_to_offsets
,
221 const std::vector
<uint32_t>& entries_per_bucket
);
224 const ImmutableOptions ioptions_
;
225 HistogramImpl keys_per_prefix_hist_
;
226 IndexRecordList record_list_
;
227 bool is_first_record_
;
229 uint32_t num_prefixes_
;
230 uint32_t num_keys_per_prefix_
;
232 uint32_t prev_key_prefix_hash_
;
233 size_t index_sparseness_
;
234 uint32_t index_size_
;
235 uint32_t sub_index_size_
;
237 const SliceTransform
* prefix_extractor_
;
238 double hash_table_ratio_
;
239 size_t huge_page_tlb_size_
;
241 std::string prev_key_prefix_
;
243 static const size_t kRecordsPerGroup
= 256;
246 } // namespace ROCKSDB_NAMESPACE
248 #endif // ROCKSDB_LITE