]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_index.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_index.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
9
10#include <string>
11#include <vector>
12
f67539c2 13#include "memory/arena.h"
7c673cae
FG
14#include "monitoring/histogram.h"
15#include "options/cf_options.h"
16#include "rocksdb/options.h"
7c673cae 17
f67539c2 18namespace ROCKSDB_NAMESPACE {
7c673cae 19
f67539c2
TL
20// The file contains two classes PlainTableIndex and PlainTableIndexBuilder
21// The two classes implement the index format of PlainTable.
20effc67 22// For description of PlainTable format, see comments of class
f67539c2
TL
23// PlainTableFactory
24//
25//
7c673cae
FG
26// PlainTableIndex contains buckets size of index_size_, each is a
27// 32-bit integer. The lower 31 bits contain an offset value (explained below)
28// and the first bit of the integer indicates type of the offset.
29//
30// +--------------+------------------------------------------------------+
31// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
32// +--------------+------------------------------------------------------+
33//
34// Explanation for the "flag bit":
35//
36// 0 indicates that the bucket contains only one prefix (no conflict when
37// hashing this prefix), whose first row starts from this offset of the
38// file.
39// 1 indicates that the bucket contains more than one prefixes, or there
40// are too many rows for one prefix so we need a binary search for it. In
41// this case, the offset indicates the offset of sub_index_ holding the
42// binary search indexes of keys for those rows. Those binary search indexes
43// are organized in this way:
44//
45// The first 4 bytes, indicate how many indexes (N) are stored after it. After
46// it, there are N 32-bit integers, each points of an offset of the file,
47// which
48// points to starting of a row. Those offsets need to be guaranteed to be in
49// ascending order so the keys they are pointing to are also in ascending
50// order
51// to make sure we can use them to do binary searches. Below is visual
52// presentation of a bucket.
53//
54// <begin>
55// number_of_records: varint32
56// record 1 file offset: fixedint32
57// record 2 file offset: fixedint32
58// ....
59// record N file offset: fixedint32
60// <end>
f67539c2
TL
61
62// The class loads the index block from a PlainTable SST file, and executes
63// the index lookup.
64// The class is used by PlainTableReader class.
7c673cae
FG
65class PlainTableIndex {
66 public:
67 enum IndexSearchResult {
68 kNoPrefixForBucket = 0,
69 kDirectToFile = 1,
70 kSubindex = 2
71 };
72
73 explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
74
75 PlainTableIndex()
76 : index_size_(0),
77 sub_index_size_(0),
78 num_prefixes_(0),
79 index_(nullptr),
80 sub_index_(nullptr) {}
81
f67539c2
TL
82 // The function that executes the lookup the hash table.
83 // The hash key is `prefix_hash`. The function fills the hash bucket
84 // content in `bucket_value`, which is up to the caller to interpret.
7c673cae
FG
85 IndexSearchResult GetOffset(uint32_t prefix_hash,
86 uint32_t* bucket_value) const;
87
f67539c2
TL
88 // Initialize data from `index_data`, which points to raw data for
89 // index stored in the SST file.
90 Status InitFromRawData(Slice index_data);
7c673cae 91
f67539c2
TL
92 // Decode the sub index for specific hash bucket.
93 // The `offset` is the value returned as `bucket_value` by GetOffset()
94 // and is only valid when the return value is `kSubindex`.
95 // The return value is the pointer to the starting address of the
96 // sub-index. `upper_bound` is filled with the value indicating how many
97 // entries the sub-index has.
7c673cae
FG
98 const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
99 uint32_t* upper_bound) const {
100 const char* index_ptr = &sub_index_[offset];
101 return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
102 }
103
104 uint32_t GetIndexSize() const { return index_size_; }
105
106 uint32_t GetSubIndexSize() const { return sub_index_size_; }
107
108 uint32_t GetNumPrefixes() const { return num_prefixes_; }
109
110 static const uint64_t kMaxFileSize = (1u << 31) - 1;
111 static const uint32_t kSubIndexMask = 0x80000000;
112 static const size_t kOffsetLen = sizeof(uint32_t);
113
114 private:
115 uint32_t index_size_;
116 uint32_t sub_index_size_;
117 uint32_t num_prefixes_;
118
119 uint32_t* index_;
120 char* sub_index_;
121};
122
123// PlainTableIndexBuilder is used to create plain table index.
124// After calling Finish(), it returns Slice, which is usually
125// used either to initialize PlainTableIndex or
126// to save index to sst file.
f67539c2 127// For more details about the index, please refer to:
7c673cae
FG
128// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
129// #wiki-in-memory-index-format
f67539c2 130// The class is used by PlainTableBuilder class.
7c673cae
FG
131class PlainTableIndexBuilder {
132 public:
1e59de90 133 PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
11fdf7f2 134 const SliceTransform* prefix_extractor,
7c673cae
FG
135 size_t index_sparseness, double hash_table_ratio,
136 size_t huge_page_tlb_size)
137 : arena_(arena),
138 ioptions_(ioptions),
139 record_list_(kRecordsPerGroup),
140 is_first_record_(true),
141 due_index_(false),
142 num_prefixes_(0),
143 num_keys_per_prefix_(0),
144 prev_key_prefix_hash_(0),
145 index_sparseness_(index_sparseness),
11fdf7f2
TL
146 index_size_(0),
147 sub_index_size_(0),
148 prefix_extractor_(prefix_extractor),
7c673cae
FG
149 hash_table_ratio_(hash_table_ratio),
150 huge_page_tlb_size_(huge_page_tlb_size) {}
151
152 void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
153
154 Slice Finish();
155
156 uint32_t GetTotalSize() const {
157 return VarintLength(index_size_) + VarintLength(num_prefixes_) +
158 PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
159 }
160
161 static const std::string kPlainTableIndexBlock;
162
163 private:
164 struct IndexRecord {
165 uint32_t hash; // hash of the prefix
166 uint32_t offset; // offset of a row
167 IndexRecord* next;
168 };
169
170 // Helper class to track all the index records
171 class IndexRecordList {
172 public:
173 explicit IndexRecordList(size_t num_records_per_group)
174 : kNumRecordsPerGroup(num_records_per_group),
175 current_group_(nullptr),
176 num_records_in_current_group_(num_records_per_group) {}
177
178 ~IndexRecordList() {
179 for (size_t i = 0; i < groups_.size(); i++) {
180 delete[] groups_[i];
181 }
182 }
183
184 void AddRecord(uint32_t hash, uint32_t offset);
185
186 size_t GetNumRecords() const {
187 return (groups_.size() - 1) * kNumRecordsPerGroup +
188 num_records_in_current_group_;
189 }
190 IndexRecord* At(size_t index) {
1e59de90
TL
191 return &(
192 groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
7c673cae
FG
193 }
194
195 private:
196 IndexRecord* AllocateNewGroup() {
197 IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
198 groups_.push_back(result);
199 return result;
200 }
201
202 // Each group in `groups_` contains fix-sized records (determined by
203 // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
204 // occurs.
205 const size_t kNumRecordsPerGroup;
206 IndexRecord* current_group_;
207 // List of arrays allocated
208 std::vector<IndexRecord*> groups_;
209 size_t num_records_in_current_group_;
210 };
211
212 void AllocateIndex();
213
214 // Internal helper function to bucket index record list to hash buckets.
215 void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
216 std::vector<uint32_t>* entries_per_bucket);
217
218 // Internal helper class to fill the indexes and bloom filters to internal
219 // data structures.
220 Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
221 const std::vector<uint32_t>& entries_per_bucket);
222
223 Arena* arena_;
1e59de90 224 const ImmutableOptions ioptions_;
7c673cae
FG
225 HistogramImpl keys_per_prefix_hist_;
226 IndexRecordList record_list_;
227 bool is_first_record_;
228 bool due_index_;
229 uint32_t num_prefixes_;
230 uint32_t num_keys_per_prefix_;
231
232 uint32_t prev_key_prefix_hash_;
233 size_t index_sparseness_;
234 uint32_t index_size_;
235 uint32_t sub_index_size_;
236
237 const SliceTransform* prefix_extractor_;
238 double hash_table_ratio_;
239 size_t huge_page_tlb_size_;
240
241 std::string prev_key_prefix_;
242
243 static const size_t kRecordsPerGroup = 256;
244};
245
1e59de90 246} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
247
248#endif // ROCKSDB_LITE