]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/block_based_filter_block.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / rocksdb / table / block_based_filter_block.cc
CommitLineData
7c673cae
FG
1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under the BSD-style license found in the
3// LICENSE file in the root directory of this source tree. An additional grant
4// of patent rights can be found in the PATENTS file in the same directory.
5//
6// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#include "table/block_based_filter_block.h"
11#include <algorithm>
12
13#include "db/dbformat.h"
14#include "monitoring/perf_context_imp.h"
15#include "rocksdb/filter_policy.h"
16#include "util/coding.h"
17#include "util/string_util.h"
18
19namespace rocksdb {
20
21namespace {
22
// Appends "KEY: VALUE\n" to *props. KEY is right-aligned in a 14-character
// column; VALUE is wrapped every 64 characters, with continuation lines
// indented by 16 spaces so they line up under the first value character.
void AppendItem(std::string* props, const std::string& key,
                const std::string& value) {
  const size_t kDataWidth = 64;  // value characters per line
  const size_t kTabWidth = 2;
  const size_t kOffWidth = 16;   // indent width for wrapped value lines

  // Build the (possibly multi-line) value column.
  std::string wrapped;
  wrapped.append(value, 0, std::min(kDataWidth, value.size()));
  for (size_t pos = kDataWidth; pos < value.size(); pos += kDataWidth) {
    wrapped.push_back('\n');
    wrapped.append(kOffWidth, ' ');
    wrapped.append(value, pos, std::min(kDataWidth, value.size() - pos));
  }

  // Right-align the key within (kOffWidth - kTabWidth) columns.
  std::string header;
  if (key.size() < kOffWidth - kTabWidth) {
    header.assign(kOffWidth - kTabWidth - key.size(), ' ');
  }
  header += key;

  props->append(header + ": " + wrapped + "\n");
}
48
49template <class TKey>
50void AppendItem(std::string* props, const TKey& key, const std::string& value) {
51 std::string key_str = rocksdb::ToString(key);
52 AppendItem(props, key_str, value);
53}
54} // namespace
55
56
// See doc/table_format.txt for an explanation of the filter block format.

// Generate a new filter every 2KB (1 << kFilterBaseLg bytes) of data.
// kFilterBaseLg is also written as the final trailer byte of the filter
// block (see Finish) so the reader can recover the granularity.
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg;
62
// Builder constructor: caches the filter policy and (optional) prefix
// extractor from the table options.  table_opt.filter_policy must be
// non-null; prev_prefix_* track the most recently added prefix so that
// consecutive duplicate prefixes are stored only once (see AddPrefix).
BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
    const SliceTransform* prefix_extractor,
    const BlockBasedTableOptions& table_opt)
    : policy_(table_opt.filter_policy.get()),
      prefix_extractor_(prefix_extractor),
      whole_key_filtering_(table_opt.whole_key_filtering),
      prev_prefix_start_(0),
      prev_prefix_size_(0) {
  assert(policy_);
}
73
74void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
75 uint64_t filter_index = (block_offset / kFilterBase);
76 assert(filter_index >= filter_offsets_.size());
77 while (filter_index > filter_offsets_.size()) {
78 GenerateFilter();
79 }
80}
81
82void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
83 if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
84 AddPrefix(key);
85 }
86
87 if (whole_key_filtering_) {
88 AddKey(key);
89 }
90}
91
92// Add key to filter if needed
93inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
94 start_.push_back(entries_.size());
95 entries_.append(key.data(), key.size());
96}
97
98// Add prefix to filter if needed
99inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
100 // get slice for most recently added entry
101 Slice prev;
102 if (prev_prefix_size_ > 0) {
103 prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_);
104 }
105
106 Slice prefix = prefix_extractor_->Transform(key);
107 // insert prefix only when it's different from the previous prefix.
108 if (prev.size() == 0 || prefix != prev) {
109 start_.push_back(entries_.size());
110 prev_prefix_start_ = entries_.size();
111 prev_prefix_size_ = prefix.size();
112 entries_.append(prefix.data(), prefix.size());
113 }
114}
115
116Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& tmp,
117 Status* status) {
118 // In this impl we ignore BlockHandle
119 *status = Status::OK();
120 if (!start_.empty()) {
121 GenerateFilter();
122 }
123
124 // Append array of per-filter offsets
125 const uint32_t array_offset = static_cast<uint32_t>(result_.size());
126 for (size_t i = 0; i < filter_offsets_.size(); i++) {
127 PutFixed32(&result_, filter_offsets_[i]);
128 }
129
130 PutFixed32(&result_, array_offset);
131 result_.push_back(kFilterBaseLg); // Save encoding parameter in result
132 return Slice(result_);
133}
134
// Builds one filter from every key/prefix accumulated since the previous
// filter, appends it to result_, and records its starting offset in
// filter_offsets_.  With no pending entries it records an empty filter
// (an offset with no filter bytes).  Resets all per-filter state at the end.
void BlockBasedFilterBlockBuilder::GenerateFilter() {
  const size_t num_entries = start_.size();
  if (num_entries == 0) {
    // Fast path if there are no keys for this filter
    filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
    return;
  }

  // Make list of keys from flattened key structure
  start_.push_back(entries_.size());  // Simplify length computation
  tmp_entries_.resize(num_entries);
  for (size_t i = 0; i < num_entries; i++) {
    // Entry i spans [start_[i], start_[i+1]) within entries_.
    const char* base = entries_.data() + start_[i];
    size_t length = start_[i + 1] - start_[i];
    tmp_entries_[i] = Slice(base, length);
  }

  // Generate filter for current set of keys and append to result_.
  filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
  policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries),
                        &result_);

  // Reset accumulation state for the next filter.
  tmp_entries_.clear();
  entries_.clear();
  start_.clear();
  prev_prefix_start_ = 0;
  prev_prefix_size_ = 0;
}
163
// Reader constructor: parses the block produced by the builder, laid out as
//   [filter data][fixed32 filter offsets][fixed32 array offset][base_lg byte]
// On a truncated or malformed trailer the reader is left with num_ == 0,
// so lookups conservatively report "may match" (see MayMatch).
BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
    const SliceTransform* prefix_extractor,
    const BlockBasedTableOptions& table_opt, bool _whole_key_filtering,
    BlockContents&& contents, Statistics* stats)
    : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
      policy_(table_opt.filter_policy.get()),
      prefix_extractor_(prefix_extractor),
      data_(nullptr),
      offset_(nullptr),
      num_(0),
      base_lg_(0),
      contents_(std::move(contents)) {
  assert(policy_);
  size_t n = contents_.data.size();
  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
  base_lg_ = contents_.data[n - 1];
  // last_word is the position where the offset array begins; it must lie
  // within the block, before the 5-byte trailer.
  uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
  if (last_word > n - 5) return;
  data_ = contents_.data.data();
  offset_ = data_ + last_word;
  num_ = (n - 5 - last_word) / 4;
}
186
187bool BlockBasedFilterBlockReader::KeyMayMatch(
188 const Slice& key, uint64_t block_offset, const bool no_io,
189 const Slice* const const_ikey_ptr) {
190 assert(block_offset != kNotValid);
191 if (!whole_key_filtering_) {
192 return true;
193 }
194 return MayMatch(key, block_offset);
195}
196
197bool BlockBasedFilterBlockReader::PrefixMayMatch(
198 const Slice& prefix, uint64_t block_offset, const bool no_io,
199 const Slice* const const_ikey_ptr) {
200 assert(block_offset != kNotValid);
201 if (!prefix_extractor_) {
202 return true;
203 }
204 return MayMatch(prefix, block_offset);
205}
206
// Core lookup: locates the filter responsible for block_offset and probes
// it for entry.  Returns false only when the filter proves absence; any
// out-of-range index or malformed offset data degrades to "may match".
bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
                                           uint64_t block_offset) {
  uint64_t index = block_offset >> base_lg_;
  if (index < num_) {
    // Slot i holds the filter's start; the next fixed32 (for the last
    // filter, the array-offset word written by the builder) is its end.
    uint32_t start = DecodeFixed32(offset_ + index * 4);
    uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
      Slice filter = Slice(data_ + start, limit - start);
      bool const may_match = policy_->KeyMayMatch(entry, filter);
      if (may_match) {
        PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
        return true;
      } else {
        PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
        return false;
      }
    } else if (start == limit) {
      // Empty filters do not match any entries
      return false;
    }
  }
  return true;  // Errors are treated as potential matches
}
230
231size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
232 return num_ * 4 + 5 + (offset_ - data_);
233}
234
235std::string BlockBasedFilterBlockReader::ToString() const {
236 std::string result, filter_meta;
237 result.reserve(1024);
238
239 std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
240 AppendItem(&result, s_fb, rocksdb::ToString(num_));
241 AppendItem(&result, s_bo, s_hd);
242
243 for (size_t index = 0; index < num_; index++) {
244 uint32_t start = DecodeFixed32(offset_ + index * 4);
245 uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
246
247 if (start != limit) {
248 result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n");
249 Slice filter = Slice(data_ + start, limit - start);
250 AppendItem(&result, start, filter.ToString(true));
251 }
252 }
253 return result;
254}
255} // namespace rocksdb