// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/block_based/block_based_filter_block.h"

#include <algorithm>

#include "db/dbformat.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/filter_policy.h"
#include "table/block_based/block_based_table_reader.h"
#include "util/coding.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {

namespace {

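// Helpers for BlockBasedFilterBlockReader::ToString() below: format a
// "key: value" line, right-aligning the key and wrapping long values
// (e.g. hex dumps) onto continuation lines indented to the value column.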
void AppendItem(std::string* props, const std::string& key,
                const std::string& value) {
  char cspace = ' ';
  std::string value_str("");
  size_t i = 0;
  const size_t dataLength = 64;
  const size_t tabLength = 2;
  const size_t offLength = 16;

  value_str.append(&value[i], std::min(size_t(dataLength), value.size()));
  i += dataLength;
  while (i < value.size()) {
    value_str.append("\n");
    value_str.append(offLength, cspace);
    value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i));
    i += dataLength;
  }

  std::string result("");
  if (key.size() < (offLength - tabLength))
    result.append(size_t((offLength - tabLength)) - key.size(), cspace);
  result.append(key);

  props->append(result + ": " + value_str + "\n");
}

template <class TKey>
void AppendItem(std::string* props, const TKey& key, const std::string& value) {
  std::string key_str = ROCKSDB_NAMESPACE::ToString(key);
  AppendItem(props, key_str, value);
}
}  // namespace

// See doc/table_format.txt for an explanation of the filter block format.
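// The filter block produced by this builder has the classic LevelDB layout:
//
//   [filter 0]
//   [filter 1]
//   ...
//   [filter N-1]
//
//   [offset of filter 0]                  : 4 bytes
//   [offset of filter 1]                  : 4 bytes
//   ...
//   [offset of filter N-1]                : 4 bytes
//
//   [offset of beginning of offset array] : 4 bytes
//   lg(base)                              : 1 byte
//
// Filter i covers the keys of all data blocks whose starting file offset
// falls in [i * base, (i + 1) * base), where base = 2^lg(base) (2KB here).
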
// Generate new filter every 2KB of data
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg;

BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
    const SliceTransform* prefix_extractor,
    const BlockBasedTableOptions& table_opt)
    : policy_(table_opt.filter_policy.get()),
      prefix_extractor_(prefix_extractor),
      whole_key_filtering_(table_opt.whole_key_filtering),
      prev_prefix_start_(0),
      prev_prefix_size_(0),
      num_added_(0) {
  assert(policy_);
}

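// StartBlock() is called before the keys of the data block beginning at
// block_offset are added. It flushes the keys accumulated so far into a
// filter and, when the offset has advanced by more than one kFilterBase
// range, emits empty filters for the skipped slots so that the index
// block_offset / kFilterBase used by the reader stays aligned.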
void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
  uint64_t filter_index = (block_offset / kFilterBase);
  assert(filter_index >= filter_offsets_.size());
  while (filter_index > filter_offsets_.size()) {
    GenerateFilter();
  }
}

void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
  if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
    AddPrefix(key);
  }

  if (whole_key_filtering_) {
    AddKey(key);
  }
}

// Add key to filter if needed
inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
  num_added_++;
  start_.push_back(entries_.size());
  entries_.append(key.data(), key.size());
}

// Add prefix to filter if needed
inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
  // get slice for most recently added entry
  Slice prev;
  if (prev_prefix_size_ > 0) {
    prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_);
  }

  Slice prefix = prefix_extractor_->Transform(key);
  // insert prefix only when it's different from the previous prefix.
  if (prev.size() == 0 || prefix != prev) {
    prev_prefix_start_ = entries_.size();
    prev_prefix_size_ = prefix.size();
    AddKey(prefix);
  }
}

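// Finish() flushes any remaining keys into a final filter and appends the
// trailer that ParseFieldsFromBlock() expects: the array of per-filter
// offsets, the offset of that array, and kFilterBaseLg.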
Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
                                           Status* status) {
  // In this impl we ignore BlockHandle
  *status = Status::OK();
  if (!start_.empty()) {
    GenerateFilter();
  }

  // Append array of per-filter offsets
  const uint32_t array_offset = static_cast<uint32_t>(result_.size());
  for (size_t i = 0; i < filter_offsets_.size(); i++) {
    PutFixed32(&result_, filter_offsets_[i]);
  }

  PutFixed32(&result_, array_offset);
  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
  return Slice(result_);
}

void BlockBasedFilterBlockBuilder::GenerateFilter() {
  const size_t num_entries = start_.size();
  if (num_entries == 0) {
    // Fast path if there are no keys for this filter
    filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
    return;
  }

  // Make list of keys from flattened key structure
  start_.push_back(entries_.size());  // Simplify length computation
  tmp_entries_.resize(num_entries);
  for (size_t i = 0; i < num_entries; i++) {
    const char* base = entries_.data() + start_[i];
    size_t length = start_[i + 1] - start_[i];
    tmp_entries_[i] = Slice(base, length);
  }

  // Generate filter for current set of keys and append to result_.
  filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
  policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries),
                        &result_);

  tmp_entries_.clear();
  entries_.clear();
  start_.clear();
  prev_prefix_start_ = 0;
  prev_prefix_size_ = 0;
}

BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
    const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block)
    : FilterBlockReaderCommon(t, std::move(filter_block)) {
  assert(table());
  assert(table()->get_rep());
  assert(table()->get_rep()->filter_policy);
}

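// The filter block is read eagerly when prefetch is requested or the block
// cache is bypassed; the reference is kept by this reader only when the
// block does not go into the cache or pin is set, otherwise the cached copy
// is looked up again on first use.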
std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create(
    const BlockBasedTable* table, const ReadOptions& ro,
    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
    bool pin, BlockCacheLookupContext* lookup_context) {
  assert(table);
  assert(table->get_rep());
  assert(!pin || prefetch);

  CachableEntry<BlockContents> filter_block;
  if (prefetch || !use_cache) {
    const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
                                     nullptr /* get_context */, lookup_context,
                                     &filter_block);
    if (!s.ok()) {
      IGNORE_STATUS_IF_ERROR(s);
      return std::unique_ptr<FilterBlockReader>();
    }

    if (use_cache && !pin) {
      filter_block.Reset();
    }
  }

  return std::unique_ptr<FilterBlockReader>(
      new BlockBasedFilterBlockReader(table, std::move(filter_block)));
}

bool BlockBasedFilterBlockReader::KeyMayMatch(
    const Slice& key, const SliceTransform* /* prefix_extractor */,
    uint64_t block_offset, const bool no_io,
    const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) {
  assert(block_offset != kNotValid);
  if (!whole_key_filtering()) {
    return true;
  }
  return MayMatch(key, block_offset, no_io, get_context, lookup_context);
}

bool BlockBasedFilterBlockReader::PrefixMayMatch(
    const Slice& prefix, const SliceTransform* /* prefix_extractor */,
    uint64_t block_offset, const bool no_io,
    const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) {
  assert(block_offset != kNotValid);
  return MayMatch(prefix, block_offset, no_io, get_context, lookup_context);
}

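// Decodes the trailer written by Finish(): the last byte is lg(base), the
// preceding 4 bytes give the start of the offset array, and the number of
// per-filter offsets follows from the space between the two.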
bool BlockBasedFilterBlockReader::ParseFieldsFromBlock(
    const BlockContents& contents, const char** data, const char** offset,
    size_t* num, size_t* base_lg) {
  assert(data);
  assert(offset);
  assert(num);
  assert(base_lg);

  const size_t n = contents.data.size();
  if (n < 5) {  // 1 byte for base_lg and 4 for start of offset array
    return false;
  }

  const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5);
  if (last_word > n - 5) {
    return false;
  }

  *data = contents.data.data();
  *offset = (*data) + last_word;
  *num = (n - 5 - last_word) / 4;
  *base_lg = contents.data[n - 1];

  return true;
}

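// Looks up the filter covering the data block that starts at block_offset
// (index = block_offset >> base_lg) and asks the filter policy whether
// `entry` may be present. Any error or malformed offset is treated as a
// potential match so that correctness never depends on the filter.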
bool BlockBasedFilterBlockReader::MayMatch(
    const Slice& entry, uint64_t block_offset, bool no_io,
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
  CachableEntry<BlockContents> filter_block;

  const Status s =
      GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
  if (!s.ok()) {
    return true;
  }

  assert(filter_block.GetValue());

  const char* data = nullptr;
  const char* offset = nullptr;
  size_t num = 0;
  size_t base_lg = 0;
  if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
                            &base_lg)) {
    return true;  // Errors are treated as potential matches
  }

  const uint64_t index = block_offset >> base_lg;
  if (index < num) {
    const uint32_t start = DecodeFixed32(offset + index * 4);
    const uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
    if (start <= limit && limit <= (uint32_t)(offset - data)) {
      const Slice filter = Slice(data + start, limit - start);

      assert(table());
      assert(table()->get_rep());
      const FilterPolicy* const policy = table()->get_rep()->filter_policy;

      const bool may_match = policy->KeyMayMatch(entry, filter);
      if (may_match) {
        PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
        return true;
      } else {
        PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
        return false;
      }
    } else if (start == limit) {
      // Empty filters do not match any entries
      return false;
    }
  }
  return true;  // Errors are treated as potential matches
}

size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
  size_t usage = ApproximateFilterBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this));
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  return usage;
}

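// Produces a human-readable dump of the filter block: the number of filters
// followed by a hex dump of each non-empty filter, keyed by its start offset.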
std::string BlockBasedFilterBlockReader::ToString() const {
  CachableEntry<BlockContents> filter_block;

  const Status s =
      GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
                           nullptr /* lookup_context */, &filter_block);
  if (!s.ok()) {
    return std::string("Unable to retrieve filter block");
  }

  assert(filter_block.GetValue());

  const char* data = nullptr;
  const char* offset = nullptr;
  size_t num = 0;
  size_t base_lg = 0;
  if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
                            &base_lg)) {
    return std::string("Error parsing filter block");
  }

  std::string result;
  result.reserve(1024);

  std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
  AppendItem(&result, s_fb, ROCKSDB_NAMESPACE::ToString(num));
  AppendItem(&result, s_bo, s_hd);

  for (size_t index = 0; index < num; index++) {
    uint32_t start = DecodeFixed32(offset + index * 4);
    uint32_t limit = DecodeFixed32(offset + index * 4 + 4);

    if (start != limit) {
      result.append(" filter block # " +
                    ROCKSDB_NAMESPACE::ToString(index + 1) + "\n");
      Slice filter = Slice(data + start, limit - start);
      AppendItem(&result, start, filter.ToString(true));
    }
  }
  return result;
}

}  // namespace ROCKSDB_NAMESPACE