// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "table/partitioned_filter_block.h"

#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include <utility>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
#include "table/block.h"
#include "table/block_based_table_reader.h"
#include "util/coding.h"

namespace rocksdb {

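// Builds filter partitions whose cut points stay in sync with the
// partitioned index builder. filters_per_partition_ is the number of keys
// the FilterBitsBuilder estimates will fit in roughly partition_size bytes
// of filter data (via CalculateNumEntry), so partitions track a byte budget
// rather than a fixed key count.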
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
    const SliceTransform* prefix_extractor, bool whole_key_filtering,
    FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
    const bool use_value_delta_encoding,
    PartitionedIndexBuilder* const p_index_builder,
    const uint32_t partition_size)
    : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
                             filter_bits_builder),
      index_on_filter_block_builder_(index_block_restart_interval,
                                     true /*use_delta_encoding*/,
                                     use_value_delta_encoding),
      index_on_filter_block_builder_without_seq_(index_block_restart_interval,
                                                 true /*use_delta_encoding*/,
                                                 use_value_delta_encoding),
      p_index_builder_(p_index_builder),
      filters_in_partition_(0),
      num_added_(0) {
  filters_per_partition_ =
      filter_bits_builder_->CalculateNumEntry(partition_size);
}

PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}

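// Cuts the in-progress filter only at a point where the index builder also
// cuts a partition, keeping filter partitions aligned with index partitions:
// once the key budget is reached, the builder keeps requesting a cut and
// finalizes the pending filter only after the index builder grants it.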
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
  // Use == to send the request only once
  if (filters_in_partition_ == filters_per_partition_) {
    // Currently only the index builder is in charge of cutting a partition.
    // We keep requesting until the cut is granted.
    p_index_builder_->RequestPartitionCut();
  }
  if (!p_index_builder_->ShouldCutFilterBlock()) {
    return;
  }
  filter_gc.push_back(std::unique_ptr<const char[]>(nullptr));
  Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
  std::string& index_key = p_index_builder_->GetPartitionKey();
  filters.push_back({index_key, filter});
  filters_in_partition_ = 0;
  Reset();
}

void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
  MaybeCutAFilterBlock();
  filter_bits_builder_->AddKey(key);
  filters_in_partition_++;
  num_added_++;
}

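// Finish() is designed to be called in a loop by the table builder: every
// call after the first receives the handle of the filter partition that was
// just written, records it in the index on filter partitions, and returns
// either the next partition with Status::Incomplete() or, once all
// partitions are flushed, the index block itself with Status::OK().
// A minimal caller sketch (WriteBlock is a hypothetical helper that
// persists a block and returns its handle):
//
//   Status s = Status::Incomplete();
//   BlockHandle handle;  // empty on the first call
//   while (s.IsIncomplete()) {
//     Slice block = builder.Finish(handle, &s);
//     handle = WriteBlock(block);  // hypothetical write helper
//   }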
Slice PartitionedFilterBlockBuilder::Finish(
    const BlockHandle& last_partition_block_handle, Status* status) {
  if (finishing_filters == true) {
    // Record the handle of the last written filter block in the index
    FilterEntry& last_entry = filters.front();
    std::string handle_encoding;
    last_partition_block_handle.EncodeTo(&handle_encoding);
    std::string handle_delta_encoding;
    PutVarsignedint64(
        &handle_delta_encoding,
        last_partition_block_handle.size() - last_encoded_handle_.size());
    last_encoded_handle_ = last_partition_block_handle;
    const Slice handle_delta_encoding_slice(handle_delta_encoding);
    index_on_filter_block_builder_.Add(last_entry.key, handle_encoding,
                                       &handle_delta_encoding_slice);
    if (!p_index_builder_->seperator_is_key_plus_seq()) {
      index_on_filter_block_builder_without_seq_.Add(
          ExtractUserKey(last_entry.key), handle_encoding,
          &handle_delta_encoding_slice);
    }
    filters.pop_front();
  } else {
    MaybeCutAFilterBlock();
  }
  // If there is no filter partition left, then return the index on filter
  // partitions
  if (UNLIKELY(filters.empty())) {
    *status = Status::OK();
    if (finishing_filters) {
      if (p_index_builder_->seperator_is_key_plus_seq()) {
        return index_on_filter_block_builder_.Finish();
      } else {
        return index_on_filter_block_builder_without_seq_.Finish();
      }
    } else {
      // This is the rare case where no key was added to the filter
      return Slice();
    }
  } else {
    // Return the next filter partition in line and set Incomplete() status to
    // indicate we expect more calls to Finish
    *status = Status::Incomplete();
    finishing_filters = true;
    return filters.front().filter;
  }
}

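// The reader keeps only the index on filter partitions resident
// (idx_on_fltr_blk_); individual filter partitions are fetched lazily on
// first use, unless CacheDependencies() prefetches and pins them up front.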
PartitionedFilterBlockReader::PartitionedFilterBlockReader(
    const SliceTransform* prefix_extractor, bool _whole_key_filtering,
    BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/,
    Statistics* stats, const InternalKeyComparator comparator,
    const BlockBasedTable* table, const bool index_key_includes_seq,
    const bool index_value_is_full)
    : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
      prefix_extractor_(prefix_extractor),
      comparator_(comparator),
      table_(table),
      index_key_includes_seq_(index_key_includes_seq),
      index_value_is_full_(index_value_is_full) {
  idx_on_fltr_blk_.reset(new Block(std::move(contents),
                                   kDisableGlobalSequenceNumber,
                                   0 /* read_amp_bytes_per_bit */, stats));
}

PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
  // TODO(myabandeh): if instead of filter object we store only the blocks in
  // block cache, then we don't have to manually erase them from block cache
  // here.
  auto block_cache = table_->rep_->table_options.block_cache.get();
  if (UNLIKELY(block_cache == nullptr)) {
    return;
  }
  char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  IndexBlockIter biter;
  BlockHandle handle;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  biter.SeekToFirst();
  for (; biter.Valid(); biter.Next()) {
    handle = biter.value();
    auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix,
                                            table_->rep_->cache_key_prefix_size,
                                            handle, cache_key);
    block_cache->Erase(key);
  }
}

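// Maps the internal key to its filter partition via the index on filter
// partitions, fetches that partition (from the pinned filter_map_, the block
// cache, or the file), and delegates the check to the partition's filter.
// Any failure to load the partition conservatively returns true (may match).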
bool PartitionedFilterBlockReader::KeyMayMatch(
    const Slice& key, const SliceTransform* prefix_extractor,
    uint64_t block_offset, const bool no_io,
    const Slice* const const_ikey_ptr) {
  assert(const_ikey_ptr != nullptr);
  assert(block_offset == kNotValid);
  if (!whole_key_filtering_) {
    return true;
  }
  if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) {
    return true;
  }
  auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr);
  if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
    return false;
  }
  bool cached = false;
  auto filter_partition =
      GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
                         &cached, prefix_extractor);
  if (UNLIKELY(!filter_partition.value)) {
    return true;
  }
  auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor,
                                                 block_offset, no_io);
  if (cached) {
    return res;
  }
  if (LIKELY(filter_partition.IsSet())) {
    filter_partition.Release(table_->rep_->table_options.block_cache.get());
  } else {
    delete filter_partition.value;
  }
  return res;
}

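// Same flow as KeyMayMatch() above, but consults the partition's prefix
// filters instead of whole-key filters.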
bool PartitionedFilterBlockReader::PrefixMayMatch(
    const Slice& prefix, const SliceTransform* prefix_extractor,
    uint64_t block_offset, const bool no_io,
    const Slice* const const_ikey_ptr) {
#ifdef NDEBUG
  (void)block_offset;
#endif
  assert(const_ikey_ptr != nullptr);
  assert(block_offset == kNotValid);
  if (!prefix_extractor_ && !prefix_extractor) {
    return true;
  }
  if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) {
    return true;
  }
  auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr);
  if (UNLIKELY(filter_handle.size() == 0)) {  // prefix is out of range
    return false;
  }
  bool cached = false;
  auto filter_partition =
      GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
                         &cached, prefix_extractor);
  if (UNLIKELY(!filter_partition.value)) {
    return true;
  }
  auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor,
                                                    kNotValid, no_io);
  if (cached) {
    return res;
  }
  if (LIKELY(filter_partition.IsSet())) {
    filter_partition.Release(table_->rep_->table_options.block_cache.get());
  } else {
    delete filter_partition.value;
  }
  return res;
}

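// The index on filter partitions maps a separator key (>= the last key
// covered by a partition) to the handle of that partition's filter block, so
// Seek() lands on the partition that could contain `entry`. An invalid
// iterator means `entry` sorts after all partitions; this is signalled with
// a null handle of size 0.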
BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
    const Slice& entry) {
  IndexBlockIter iter;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &iter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  iter.Seek(entry);
  if (UNLIKELY(!iter.Valid())) {
    return BlockHandle(0, 0);
  }
  assert(iter.Valid());
  BlockHandle fltr_blk_handle = iter.value();
  return fltr_blk_handle;
}

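// Resolves a filter partition in three steps: (1) the pinned filter_map_
// filled by CacheDependencies(); (2) the block cache via
// BlockBasedTable::GetFilter(); (3) a direct file read when no block cache
// is configured, in which case the caller owns the returned reader.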
BlockBasedTable::CachableEntry<FilterBlockReader>
PartitionedFilterBlockReader::GetFilterPartition(
    FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle,
    const bool no_io, bool* cached, const SliceTransform* prefix_extractor) {
  const bool is_a_filter_partition = true;
  auto block_cache = table_->rep_->table_options.block_cache.get();
  if (LIKELY(block_cache != nullptr)) {
    if (filter_map_.size() != 0) {
      auto iter = filter_map_.find(fltr_blk_handle.offset());
      // The partition may be absent from the pinned filter_map_ if the block
      // cache did not have space for it at pinning time.
      if (iter != filter_map_.end()) {
        PERF_COUNTER_ADD(block_cache_hit_count, 1);
        RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT);
        RecordTick(statistics(), BLOCK_CACHE_HIT);
        RecordTick(statistics(), BLOCK_CACHE_BYTES_READ,
                   block_cache->GetUsage(iter->second.cache_handle));
        *cached = true;
        return iter->second;
      }
    }
    return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle,
                             is_a_filter_partition, no_io,
                             /* get_context */ nullptr, prefix_extractor);
  } else {
    auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle,
                                     is_a_filter_partition, prefix_extractor);
    return {filter, nullptr};
  }
}

size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
  size_t usage = idx_on_fltr_blk_->usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  return usage;
  // TODO(myabandeh): better estimation for filter_map_ size
}

// Release the cached entry and decrement its ref count.
void ReleaseFilterCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}

// TODO(myabandeh): merge this with the same function in IndexReader
void PartitionedFilterBlockReader::CacheDependencies(
    bool pin, const SliceTransform* prefix_extractor) {
  // Before reading partitions, prefetch them to avoid lots of IOs
  auto rep = table_->rep_;
  IndexBlockIter biter;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  // Filter partitions are assumed to be consecutive. Prefetch them all.
  // Read the first block offset
  biter.SeekToFirst();
  BlockHandle handle = biter.value();
  uint64_t prefetch_off = handle.offset();

  // Read the last block's offset
  biter.SeekToLast();
  handle = biter.value();
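  // The prefetch range must also cover the last partition's block trailer
  // (1-byte compression type + 32-bit checksum), hence kBlockTrailerSize.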
  uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
  uint64_t prefetch_len = last_off - prefetch_off;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
  auto& file = table_->rep_->file;
  prefetch_buffer.reset(new FilePrefetchBuffer());
  Status s;
  s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                static_cast<size_t>(prefetch_len));

  // After prefetch, read the partitions one by one
  biter.SeekToFirst();
  Cache* block_cache = rep->table_options.block_cache.get();
  for (; biter.Valid(); biter.Next()) {
    handle = biter.value();
    const bool no_io = true;
    const bool is_a_filter_partition = true;
    auto filter = table_->GetFilter(
        prefetch_buffer.get(), handle, is_a_filter_partition, !no_io,
        /* get_context */ nullptr, prefix_extractor);
    if (LIKELY(filter.IsSet())) {
      if (pin) {
        filter_map_[handle.offset()] = std::move(filter);
        RegisterCleanup(&ReleaseFilterCachedEntry, block_cache,
                        filter.cache_handle);
      } else {
        block_cache->Release(filter.cache_handle);
      }
    } else {
      delete filter.value;
    }
  }
}

}  // namespace rocksdb