1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
13 #include "db/dbformat.h"
14 #include "monitoring/histogram.h"
15 #include "options/cf_options.h"
16 #include "rocksdb/options.h"
17 #include "util/arena.h"
18 #include "util/hash.h"
19 #include "util/murmurhash.h"
23 // PlainTableIndex contains buckets size of index_size_, each is a
24 // 32-bit integer. The lower 31 bits contain an offset value (explained below)
25 // and the first bit of the integer indicates type of the offset.
27 // +--------------+------------------------------------------------------+
28 // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
29 // +--------------+------------------------------------------------------+
31 // Explanation for the "flag bit":
33 // 0 indicates that the bucket contains only one prefix (no conflict when
34 // hashing this prefix), whose first row starts from this offset of the
36 // 1 indicates that the bucket contains more than one prefixes, or there
37 // are too many rows for one prefix so we need a binary search for it. In
38 // this case, the offset indicates the offset of sub_index_ holding the
39 // binary search indexes of keys for those rows. Those binary search indexes
40 // are organized in this way:
42 // The first 4 bytes, indicate how many indexes (N) are stored after it. After
43 // it, there are N 32-bit integers, each points of an offset of the file,
45 // points to starting of a row. Those offsets need to be guaranteed to be in
46 // ascending order so the keys they are pointing to are also in ascending
48 // to make sure we can use them to do binary searches. Below is visual
49 // presentation of a bucket.
52 // number_of_records: varint32
53 // record 1 file offset: fixedint32
54 // record 2 file offset: fixedint32
56 // record N file offset: fixedint32
58 class PlainTableIndex
{
60 enum IndexSearchResult
{
61 kNoPrefixForBucket
= 0,
66 explicit PlainTableIndex(Slice data
) { InitFromRawData(data
); }
73 sub_index_(nullptr) {}
75 IndexSearchResult
GetOffset(uint32_t prefix_hash
,
76 uint32_t* bucket_value
) const;
78 Status
InitFromRawData(Slice data
);
80 const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset
,
81 uint32_t* upper_bound
) const {
82 const char* index_ptr
= &sub_index_
[offset
];
83 return GetVarint32Ptr(index_ptr
, index_ptr
+ 4, upper_bound
);
86 uint32_t GetIndexSize() const { return index_size_
; }
88 uint32_t GetSubIndexSize() const { return sub_index_size_
; }
90 uint32_t GetNumPrefixes() const { return num_prefixes_
; }
92 static const uint64_t kMaxFileSize
= (1u << 31) - 1;
93 static const uint32_t kSubIndexMask
= 0x80000000;
94 static const size_t kOffsetLen
= sizeof(uint32_t);
98 uint32_t sub_index_size_
;
99 uint32_t num_prefixes_
;
105 // PlainTableIndexBuilder is used to create plain table index.
106 // After calling Finish(), it returns Slice, which is usually
107 // used either to initialize PlainTableIndex or
108 // to save index to sst file.
109 // For more details about the index, please refer to:
110 // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
111 // #wiki-in-memory-index-format
112 class PlainTableIndexBuilder
{
114 PlainTableIndexBuilder(Arena
* arena
, const ImmutableCFOptions
& ioptions
,
115 size_t index_sparseness
, double hash_table_ratio
,
116 size_t huge_page_tlb_size
)
119 record_list_(kRecordsPerGroup
),
120 is_first_record_(true),
123 num_keys_per_prefix_(0),
124 prev_key_prefix_hash_(0),
125 index_sparseness_(index_sparseness
),
126 prefix_extractor_(ioptions
.prefix_extractor
),
127 hash_table_ratio_(hash_table_ratio
),
128 huge_page_tlb_size_(huge_page_tlb_size
) {}
130 void AddKeyPrefix(Slice key_prefix_slice
, uint32_t key_offset
);
134 uint32_t GetTotalSize() const {
135 return VarintLength(index_size_
) + VarintLength(num_prefixes_
) +
136 PlainTableIndex::kOffsetLen
* index_size_
+ sub_index_size_
;
139 static const std::string kPlainTableIndexBlock
;
143 uint32_t hash
; // hash of the prefix
144 uint32_t offset
; // offset of a row
148 // Helper class to track all the index records
149 class IndexRecordList
{
151 explicit IndexRecordList(size_t num_records_per_group
)
152 : kNumRecordsPerGroup(num_records_per_group
),
153 current_group_(nullptr),
154 num_records_in_current_group_(num_records_per_group
) {}
157 for (size_t i
= 0; i
< groups_
.size(); i
++) {
162 void AddRecord(uint32_t hash
, uint32_t offset
);
164 size_t GetNumRecords() const {
165 return (groups_
.size() - 1) * kNumRecordsPerGroup
+
166 num_records_in_current_group_
;
168 IndexRecord
* At(size_t index
) {
169 return &(groups_
[index
/ kNumRecordsPerGroup
]
170 [index
% kNumRecordsPerGroup
]);
174 IndexRecord
* AllocateNewGroup() {
175 IndexRecord
* result
= new IndexRecord
[kNumRecordsPerGroup
];
176 groups_
.push_back(result
);
180 // Each group in `groups_` contains fix-sized records (determined by
181 // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
183 const size_t kNumRecordsPerGroup
;
184 IndexRecord
* current_group_
;
185 // List of arrays allocated
186 std::vector
<IndexRecord
*> groups_
;
187 size_t num_records_in_current_group_
;
190 void AllocateIndex();
192 // Internal helper function to bucket index record list to hash buckets.
193 void BucketizeIndexes(std::vector
<IndexRecord
*>* hash_to_offsets
,
194 std::vector
<uint32_t>* entries_per_bucket
);
196 // Internal helper class to fill the indexes and bloom filters to internal
198 Slice
FillIndexes(const std::vector
<IndexRecord
*>& hash_to_offsets
,
199 const std::vector
<uint32_t>& entries_per_bucket
);
202 const ImmutableCFOptions ioptions_
;
203 HistogramImpl keys_per_prefix_hist_
;
204 IndexRecordList record_list_
;
205 bool is_first_record_
;
207 uint32_t num_prefixes_
;
208 uint32_t num_keys_per_prefix_
;
210 uint32_t prev_key_prefix_hash_
;
211 size_t index_sparseness_
;
212 uint32_t index_size_
;
213 uint32_t sub_index_size_
;
215 const SliceTransform
* prefix_extractor_
;
216 double hash_table_ratio_
;
217 size_t huge_page_tlb_size_
;
219 std::string prev_key_prefix_
;
221 static const size_t kRecordsPerGroup
= 256;
224 }; // namespace rocksdb
226 #endif // ROCKSDB_LITE