// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "table/partitioned_filter_block.h"

#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include <utility>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
#include "table/block.h"
#include "table/block_based_table_reader.h"
#include "util/coding.h"

namespace rocksdb {

PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
    const SliceTransform* prefix_extractor, bool whole_key_filtering,
    FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
    const bool use_value_delta_encoding,
    PartitionedIndexBuilder* const p_index_builder,
    const uint32_t partition_size)
    : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
                             filter_bits_builder),
      index_on_filter_block_builder_(index_block_restart_interval,
                                     true /*use_delta_encoding*/,
                                     use_value_delta_encoding),
      index_on_filter_block_builder_without_seq_(index_block_restart_interval,
                                                 true /*use_delta_encoding*/,
                                                 use_value_delta_encoding),
      p_index_builder_(p_index_builder),
      filters_in_partition_(0),
      num_added_(0) {
  filters_per_partition_ =
      filter_bits_builder_->CalculateNumEntry(partition_size);
}

PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}

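// Cut a filter partition once filters_in_partition_ reaches
// filters_per_partition_. The cut itself must be granted by the partitioned
// index builder, so filter partitions stay aligned with index partitions.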
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
  // Use == to send the request only once
  if (filters_in_partition_ == filters_per_partition_) {
    // Currently only the index builder is in charge of cutting a partition.
    // We keep requesting until it is granted.
    p_index_builder_->RequestPartitionCut();
  }
  if (!p_index_builder_->ShouldCutFilterBlock()) {
    return;
  }
  filter_gc.push_back(std::unique_ptr<const char[]>(nullptr));
  Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
  std::string& index_key = p_index_builder_->GetPartitionKey();
  filters.push_back({index_key, filter});
  filters_in_partition_ = 0;
  Reset();
}

void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
  MaybeCutAFilterBlock();
  filter_bits_builder_->AddKey(key);
  filters_in_partition_++;
  num_added_++;
}

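// Finish() is designed to be called repeatedly: each call that still has a
// filter partition pending returns that partition and sets *status to
// Incomplete(); the final call returns the index-on-filter block and sets
// *status to OK(). A minimal driver loop (a sketch, assuming a caller that
// writes each returned block and tracks its handle):
//
//   Status s = Status::Incomplete();
//   while (s.IsIncomplete()) {
//     Slice block = builder->Finish(last_partition_block_handle, &s);
//     // ... write block to the file, update last_partition_block_handle ...
//   }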
Slice PartitionedFilterBlockBuilder::Finish(
    const BlockHandle& last_partition_block_handle, Status* status) {
  if (finishing_filters == true) {
    // Record the handle of the last written filter block in the index
    FilterEntry& last_entry = filters.front();
    std::string handle_encoding;
    last_partition_block_handle.EncodeTo(&handle_encoding);
    std::string handle_delta_encoding;
    PutVarsignedint64(
        &handle_delta_encoding,
        last_partition_block_handle.size() - last_encoded_handle_.size());
    last_encoded_handle_ = last_partition_block_handle;
    const Slice handle_delta_encoding_slice(handle_delta_encoding);
    index_on_filter_block_builder_.Add(last_entry.key, handle_encoding,
                                       &handle_delta_encoding_slice);
    if (!p_index_builder_->seperator_is_key_plus_seq()) {
      index_on_filter_block_builder_without_seq_.Add(
          ExtractUserKey(last_entry.key), handle_encoding,
          &handle_delta_encoding_slice);
    }
    filters.pop_front();
  } else {
    MaybeCutAFilterBlock();
  }
  // If there is no filter partition left, then return the index on filter
  // partitions
  if (UNLIKELY(filters.empty())) {
    *status = Status::OK();
    if (finishing_filters) {
      if (p_index_builder_->seperator_is_key_plus_seq()) {
        return index_on_filter_block_builder_.Finish();
      } else {
        return index_on_filter_block_builder_without_seq_.Finish();
      }
    } else {
      // This is the rare case where no key was added to the filter
      return Slice();
    }
  } else {
    // Return the next filter partition in line and set Incomplete() status
    // to indicate we expect more calls to Finish
    *status = Status::Incomplete();
    finishing_filters = true;
    return filters.front().filter;
  }
}

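// The reader keeps the index-on-filter block resident (idx_on_fltr_blk_) and
// loads individual filter partitions on demand, either through the block
// cache or directly from the file when no block cache is configured.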
PartitionedFilterBlockReader::PartitionedFilterBlockReader(
    const SliceTransform* prefix_extractor, bool _whole_key_filtering,
    BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/,
    Statistics* stats, const InternalKeyComparator comparator,
    const BlockBasedTable* table, const bool index_key_includes_seq,
    const bool index_value_is_full)
    : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
      prefix_extractor_(prefix_extractor),
      comparator_(comparator),
      table_(table),
      index_key_includes_seq_(index_key_includes_seq),
      index_value_is_full_(index_value_is_full) {
  idx_on_fltr_blk_.reset(new Block(std::move(contents),
                                   kDisableGlobalSequenceNumber,
                                   0 /* read_amp_bytes_per_bit */, stats));
}

PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
  // TODO(myabandeh): if instead of filter object we store only the blocks in
  // block cache, then we don't have to manually erase them from block cache
  // when the file is closed
  auto block_cache = table_->rep_->table_options.block_cache.get();
  if (UNLIKELY(block_cache == nullptr)) {
    return;
  }
  char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  IndexBlockIter biter;
  BlockHandle handle;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  biter.SeekToFirst();
  for (; biter.Valid(); biter.Next()) {
    handle = biter.value();
    auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix,
                                            table_->rep_->cache_key_prefix_size,
                                            handle, cache_key);
    block_cache->Erase(key);
  }
}

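// Lookup is two-level: first find the filter partition covering the internal
// key via the index-on-filter block, then load that partition and query its
// filter. An empty handle means the key is past the last partition, so it
// cannot be present in the table.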
bool PartitionedFilterBlockReader::KeyMayMatch(
    const Slice& key, const SliceTransform* prefix_extractor,
    uint64_t block_offset, const bool no_io,
    const Slice* const const_ikey_ptr) {
  assert(const_ikey_ptr != nullptr);
  assert(block_offset == kNotValid);
  if (!whole_key_filtering_) {
    return true;
  }
  if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) {
    return true;
  }
  auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr);
  if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
    return false;
  }
  bool cached = false;
  auto filter_partition =
      GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
                         &cached, prefix_extractor);
  if (UNLIKELY(!filter_partition.value)) {
    return true;
  }
  auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor,
                                                 block_offset, no_io);
  if (cached) {
    return res;
  }
  if (LIKELY(filter_partition.IsSet())) {
    filter_partition.Release(table_->rep_->table_options.block_cache.get());
  } else {
    delete filter_partition.value;
  }
  return res;
}

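// Same flow as KeyMayMatch(), but queries the partition's filter with the
// prefix instead of the whole key.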
bool PartitionedFilterBlockReader::PrefixMayMatch(
    const Slice& prefix, const SliceTransform* prefix_extractor,
    uint64_t block_offset, const bool no_io,
    const Slice* const const_ikey_ptr) {
#ifdef NDEBUG
  (void)block_offset;
#endif
  assert(const_ikey_ptr != nullptr);
  assert(block_offset == kNotValid);
  if (!prefix_extractor_ && !prefix_extractor) {
    return true;
  }
  if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) {
    return true;
  }
  auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr);
  if (UNLIKELY(filter_handle.size() == 0)) {  // prefix is out of range
    return false;
  }
  bool cached = false;
  auto filter_partition =
      GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
                         &cached, prefix_extractor);
  if (UNLIKELY(!filter_partition.value)) {
    return true;
  }
  auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor,
                                                    kNotValid, no_io);
  if (cached) {
    return res;
  }
  if (LIKELY(filter_partition.IsSet())) {
    filter_partition.Release(table_->rep_->table_options.block_cache.get());
  } else {
    delete filter_partition.value;
  }
  return res;
}

BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
    const Slice& entry) {
  IndexBlockIter iter;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &iter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  iter.Seek(entry);
  if (UNLIKELY(!iter.Valid())) {
    return BlockHandle(0, 0);
  }
  assert(iter.Valid());
  BlockHandle fltr_blk_handle = iter.value();
  return fltr_blk_handle;
}

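// Returns the filter partition, preferring (in order) a handle pinned in
// filter_map_ by CacheDependencies(), then the block cache, and finally a
// direct file read when no block cache is configured. *cached is set only
// when the partition was served from filter_map_.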
BlockBasedTable::CachableEntry<FilterBlockReader>
PartitionedFilterBlockReader::GetFilterPartition(
    FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle,
    const bool no_io, bool* cached, const SliceTransform* prefix_extractor) {
  const bool is_a_filter_partition = true;
  auto block_cache = table_->rep_->table_options.block_cache.get();
  if (LIKELY(block_cache != nullptr)) {
    if (filter_map_.size() != 0) {
      auto iter = filter_map_.find(fltr_blk_handle.offset());
      // This is a possible scenario since block cache might not have had
      // space for the partition
      if (iter != filter_map_.end()) {
        PERF_COUNTER_ADD(block_cache_hit_count, 1);
        RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT);
        RecordTick(statistics(), BLOCK_CACHE_HIT);
        RecordTick(statistics(), BLOCK_CACHE_BYTES_READ,
                   block_cache->GetUsage(iter->second.cache_handle));
        *cached = true;
        return iter->second;
      }
    }
    return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle,
                             is_a_filter_partition, no_io,
                             /* get_context */ nullptr, prefix_extractor);
  } else {
    auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle,
                                     is_a_filter_partition, prefix_extractor);
    return {filter, nullptr};
  }
}

size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
  size_t usage = idx_on_fltr_blk_->usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  return usage;
  // TODO(myabandeh): better estimation for filter_map_ size
}

// Release the cached entry and decrement its ref count.
void ReleaseFilterCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}

// TODO(myabandeh): merge this with the same function in IndexReader
void PartitionedFilterBlockReader::CacheDependencies(
    bool pin, const SliceTransform* prefix_extractor) {
  // Before reading partitions, prefetch them to avoid lots of IOs
  auto rep = table_->rep_;
  IndexBlockIter biter;
  Statistics* kNullStats = nullptr;
  idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
      &comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
      index_key_includes_seq_, index_value_is_full_);
  // Index partitions are assumed to be consecutive. Prefetch them all.
  // Read the first block offset
  biter.SeekToFirst();
  BlockHandle handle = biter.value();
  uint64_t prefetch_off = handle.offset();

  // Read the last block's offset
  biter.SeekToLast();
  handle = biter.value();
  uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
  uint64_t prefetch_len = last_off - prefetch_off;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
  auto& file = table_->rep_->file;
  prefetch_buffer.reset(new FilePrefetchBuffer());
  Status s;
  s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                static_cast<size_t>(prefetch_len));

  // After prefetch, read the partitions one by one
  biter.SeekToFirst();
  Cache* block_cache = rep->table_options.block_cache.get();
  for (; biter.Valid(); biter.Next()) {
    handle = biter.value();
    const bool no_io = true;
    const bool is_a_filter_partition = true;
    auto filter = table_->GetFilter(
        prefetch_buffer.get(), handle, is_a_filter_partition, !no_io,
        /* get_context */ nullptr, prefix_extractor);
    if (LIKELY(filter.IsSet())) {
      if (pin) {
        filter_map_[handle.offset()] = std::move(filter);
        RegisterCleanup(&ReleaseFilterCachedEntry, block_cache,
                        filter.cache_handle);
      } else {
        block_cache->Release(filter.cache_handle);
      }
    } else {
      delete filter.value;
    }
  }
}

}  // namespace rocksdb