// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#ifndef ROCKSDB_LITE

#include <string>
#include <vector>

#include "db/dbformat.h"
#include "monitoring/histogram.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/murmurhash.h"

namespace rocksdb {

// PlainTableIndex contains index_size_ buckets, each a 32-bit integer. The
// lower 31 bits hold an offset value (explained below) and the top bit
// indicates the type of that offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     |
// +--------------+------------------------------------------------------+
//
// Explanation of the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
//   hashing this prefix), whose first row starts at this offset of the
//   file.
// 1 indicates that the bucket contains more than one prefix, or that there
//   are too many rows for one prefix so a binary search is needed. In this
//   case, the offset is an offset into sub_index_, which holds the binary
//   search indexes of the keys for those rows. Those binary search indexes
//   are organized as follows:
//
// The first bytes form a varint32 indicating how many indexes (N) are stored
// after it. They are followed by N 32-bit integers, each a file offset
// pointing to the start of a row. Those offsets are guaranteed to be in
// ascending order, so the keys they point to are also in ascending order and
// can be used for binary search. Below is a visual representation of such a
// bucket (an illustrative lookup sketch follows the class definition).
//
// <begin>
//   number_of_records:  varint32
//   record 1 file offset:  fixedint32
//   record 2 file offset:  fixedint32
//   ....
//   record N file offset:  fixedint32
// <end>
class PlainTableIndex {
 public:
  enum IndexSearchResult {
    kNoPrefixForBucket = 0,
    kDirectToFile = 1,
    kSubindex = 2
  };

  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }

  PlainTableIndex()
      : index_size_(0),
        sub_index_size_(0),
        num_prefixes_(0),
        index_(nullptr),
        sub_index_(nullptr) {}

  IndexSearchResult GetOffset(uint32_t prefix_hash,
                              uint32_t* bucket_value) const;

  Status InitFromRawData(Slice data);

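  // Reads the varint32 stored at `offset` in sub_index_ (the number of fixed
  // 32-bit file offsets that follow) into *upper_bound, and returns a pointer
  // to the first of those fixed-width entries.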
  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
                                              uint32_t* upper_bound) const {
    const char* index_ptr = &sub_index_[offset];
    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
  }

  uint32_t GetIndexSize() const { return index_size_; }

  uint32_t GetSubIndexSize() const { return sub_index_size_; }

  uint32_t GetNumPrefixes() const { return num_prefixes_; }

  static const uint64_t kMaxFileSize = (1u << 31) - 1;
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);

 private:
  uint32_t index_size_;
  uint32_t sub_index_size_;
  uint32_t num_prefixes_;

  uint32_t* index_;
  char* sub_index_;
};

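// Illustrative lookup sketch (a hypothetical helper, not part of the upstream
// header): one way a reader might combine GetOffset() and
// GetSubIndexBasePtrAndUpperBound() to find the first row offset for a prefix
// hash. DecodeFixed32() from util/coding.h is assumed to be visible through
// the includes above.
inline bool ExampleFindFirstRowOffset(const PlainTableIndex& index,
                                      uint32_t prefix_hash,
                                      uint32_t* first_row_offset) {
  uint32_t bucket_value;
  switch (index.GetOffset(prefix_hash, &bucket_value)) {
    case PlainTableIndex::kNoPrefixForBucket:
      // Nothing was hashed into this bucket.
      return false;
    case PlainTableIndex::kDirectToFile:
      // The bucket points straight at the only row for this prefix.
      *first_row_offset = bucket_value;
      return true;
    case PlainTableIndex::kSubindex: {
      // The bucket points into sub_index_: a varint32 record count followed
      // by that many fixed 32-bit file offsets in ascending order.
      uint32_t num_records;
      const char* base =
          index.GetSubIndexBasePtrAndUpperBound(bucket_value, &num_records);
      if (num_records == 0) {
        return false;
      }
      // A real reader would binary-search these entries; the first one is the
      // smallest file offset for this bucket.
      *first_row_offset = DecodeFixed32(base);
      return true;
    }
  }
  return false;
}
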
// PlainTableIndexBuilder is used to create a plain table index.
// After calling Finish(), it returns a Slice, which is usually
// used either to initialize a PlainTableIndex or
// to save the index to an SST file.
// For more details about the index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
class PlainTableIndexBuilder {
 public:
  PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
                         size_t index_sparseness, double hash_table_ratio,
                         size_t huge_page_tlb_size)
      : arena_(arena),
        ioptions_(ioptions),
        record_list_(kRecordsPerGroup),
        is_first_record_(true),
        due_index_(false),
        num_prefixes_(0),
        num_keys_per_prefix_(0),
        prev_key_prefix_hash_(0),
        index_sparseness_(index_sparseness),
        prefix_extractor_(ioptions.prefix_extractor),
        hash_table_ratio_(hash_table_ratio),
        huge_page_tlb_size_(huge_page_tlb_size) {}

  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);

  Slice Finish();

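  // Total number of bytes needed to store the index: the two varint32 header
  // fields (index_size_ and num_prefixes_), the bucket array of index_size_
  // 32-bit entries, and the sub-index.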
  uint32_t GetTotalSize() const {
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  }

  static const std::string kPlainTableIndexBlock;

 private:
  struct IndexRecord {
    uint32_t hash;    // hash of the prefix
    uint32_t offset;  // offset of a row
    IndexRecord* next;
  };

  // Helper class to track all the index records
  class IndexRecordList {
   public:
    explicit IndexRecordList(size_t num_records_per_group)
        : kNumRecordsPerGroup(num_records_per_group),
          current_group_(nullptr),
          num_records_in_current_group_(num_records_per_group) {}

    ~IndexRecordList() {
      for (size_t i = 0; i < groups_.size(); i++) {
        delete[] groups_[i];
      }
    }

    void AddRecord(uint32_t hash, uint32_t offset);

    size_t GetNumRecords() const {
      return (groups_.size() - 1) * kNumRecordsPerGroup +
             num_records_in_current_group_;
    }
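    // For example (illustration only): with kNumRecordsPerGroup == 256,
    // record index 300 is stored at groups_[300 / 256][300 % 256], i.e.
    // groups_[1][44].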
    IndexRecord* At(size_t index) {
      return &(groups_[index / kNumRecordsPerGroup]
                      [index % kNumRecordsPerGroup]);
    }

   private:
    IndexRecord* AllocateNewGroup() {
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
      groups_.push_back(result);
      return result;
    }

    // Each group in `groups_` contains a fixed number of records (determined
    // by kNumRecordsPerGroup), which helps minimize the cost when resizing
    // occurs.
    const size_t kNumRecordsPerGroup;
    IndexRecord* current_group_;
    // List of arrays allocated
    std::vector<IndexRecord*> groups_;
    size_t num_records_in_current_group_;
  };

  void AllocateIndex();

  // Internal helper function to distribute the index record list into hash
  // buckets.
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
                        std::vector<uint32_t>* entries_per_bucket);

  // Internal helper function to fill the indexes and bloom filters into the
  // internal data structures.
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
                    const std::vector<uint32_t>& entries_per_bucket);

  Arena* arena_;
  const ImmutableCFOptions ioptions_;
  HistogramImpl keys_per_prefix_hist_;
  IndexRecordList record_list_;
  bool is_first_record_;
  bool due_index_;
  uint32_t num_prefixes_;
  uint32_t num_keys_per_prefix_;

  uint32_t prev_key_prefix_hash_;
  size_t index_sparseness_;
  uint32_t index_size_;
  uint32_t sub_index_size_;

  const SliceTransform* prefix_extractor_;
  double hash_table_ratio_;
  size_t huge_page_tlb_size_;

  std::string prev_key_prefix_;

  static const size_t kRecordsPerGroup = 256;
};
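
// Illustrative usage sketch (hypothetical call site, not upstream code): the
// builder is typically fed one prefix per key, in file order, and Finish()
// then yields the serialized index contents, backed by `arena`. The parameter
// values below are examples only.
//
//   PlainTableIndexBuilder builder(&arena, ioptions, /*index_sparseness=*/16,
//                                  /*hash_table_ratio=*/0.75,
//                                  /*huge_page_tlb_size=*/0);
//   for (/* each key, in the order it is written to the file */) {
//     builder.AddKeyPrefix(prefix_extractor->Transform(user_key), key_offset);
//   }
//   Slice index_contents = builder.Finish();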

}  // namespace rocksdb

#endif  // ROCKSDB_LITE