]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/get_context.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / get_context.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #include "table/get_context.h"
7 #include "db/merge_helper.h"
8 #include "db/pinned_iterators_manager.h"
9 #include "db/read_callback.h"
10 #include "monitoring/file_read_sample.h"
11 #include "monitoring/perf_context_imp.h"
12 #include "monitoring/statistics.h"
13 #include "rocksdb/env.h"
14 #include "rocksdb/merge_operator.h"
15 #include "rocksdb/statistics.h"
16
17 namespace ROCKSDB_NAMESPACE {
18
19 namespace {
20
21 void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
22 #ifndef ROCKSDB_LITE
23 if (replay_log) {
24 if (replay_log->empty()) {
25 // Optimization: in the common case of only one operation in the
26 // log, we allocate the exact amount of space needed.
27 replay_log->reserve(1 + VarintLength(value.size()) + value.size());
28 }
29 replay_log->push_back(type);
30 PutLengthPrefixedSlice(replay_log, value);
31 }
32 #else
33 (void)replay_log;
34 (void)type;
35 (void)value;
36 #endif // ROCKSDB_LITE
37 }
38
39 } // namespace
40
41 GetContext::GetContext(
42 const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
43 Statistics* statistics, GetState init_state, const Slice& user_key,
44 PinnableSlice* pinnable_val, std::string* timestamp, bool* value_found,
45 MergeContext* merge_context, bool do_merge,
46 SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq,
47 PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback,
48 bool* is_blob_index, uint64_t tracing_get_id)
49 : ucmp_(ucmp),
50 merge_operator_(merge_operator),
51 logger_(logger),
52 statistics_(statistics),
53 state_(init_state),
54 user_key_(user_key),
55 pinnable_val_(pinnable_val),
56 timestamp_(timestamp),
57 value_found_(value_found),
58 merge_context_(merge_context),
59 max_covering_tombstone_seq_(_max_covering_tombstone_seq),
60 env_(env),
61 seq_(seq),
62 replay_log_(nullptr),
63 pinned_iters_mgr_(_pinned_iters_mgr),
64 callback_(callback),
65 do_merge_(do_merge),
66 is_blob_index_(is_blob_index),
67 tracing_get_id_(tracing_get_id) {
68 if (seq_) {
69 *seq_ = kMaxSequenceNumber;
70 }
71 sample_ = should_sample_file_read();
72 }
73
74 GetContext::GetContext(
75 const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
76 Statistics* statistics, GetState init_state, const Slice& user_key,
77 PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
78 bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env,
79 SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr,
80 ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id)
81 : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key,
82 pinnable_val, nullptr, value_found, merge_context, do_merge,
83 _max_covering_tombstone_seq, env, seq, _pinned_iters_mgr,
84 callback, is_blob_index, tracing_get_id) {}
85
86 // Called from TableCache::Get and Table::Get when file/block in which
87 // key may exist are not there in TableCache/BlockCache respectively. In this
88 // case we can't guarantee that key does not exist and are not permitted to do
89 // IO to be certain.Set the status=kFound and value_found=false to let the
90 // caller know that key may exist but is not there in memory
91 void GetContext::MarkKeyMayExist() {
92 state_ = kFound;
93 if (value_found_ != nullptr) {
94 *value_found_ = false;
95 }
96 }
97
98 void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
99 assert(state_ == kNotFound);
100 appendToReplayLog(replay_log_, kTypeValue, value);
101
102 state_ = kFound;
103 if (LIKELY(pinnable_val_ != nullptr)) {
104 pinnable_val_->PinSelf(value);
105 }
106 }
107
108 void GetContext::ReportCounters() {
109 if (get_context_stats_.num_cache_hit > 0) {
110 RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
111 }
112 if (get_context_stats_.num_cache_index_hit > 0) {
113 RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
114 get_context_stats_.num_cache_index_hit);
115 }
116 if (get_context_stats_.num_cache_data_hit > 0) {
117 RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
118 get_context_stats_.num_cache_data_hit);
119 }
120 if (get_context_stats_.num_cache_filter_hit > 0) {
121 RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
122 get_context_stats_.num_cache_filter_hit);
123 }
124 if (get_context_stats_.num_cache_compression_dict_hit > 0) {
125 RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
126 get_context_stats_.num_cache_compression_dict_hit);
127 }
128 if (get_context_stats_.num_cache_index_miss > 0) {
129 RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
130 get_context_stats_.num_cache_index_miss);
131 }
132 if (get_context_stats_.num_cache_filter_miss > 0) {
133 RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
134 get_context_stats_.num_cache_filter_miss);
135 }
136 if (get_context_stats_.num_cache_data_miss > 0) {
137 RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
138 get_context_stats_.num_cache_data_miss);
139 }
140 if (get_context_stats_.num_cache_compression_dict_miss > 0) {
141 RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
142 get_context_stats_.num_cache_compression_dict_miss);
143 }
144 if (get_context_stats_.num_cache_bytes_read > 0) {
145 RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
146 get_context_stats_.num_cache_bytes_read);
147 }
148 if (get_context_stats_.num_cache_miss > 0) {
149 RecordTick(statistics_, BLOCK_CACHE_MISS,
150 get_context_stats_.num_cache_miss);
151 }
152 if (get_context_stats_.num_cache_add > 0) {
153 RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
154 }
155 if (get_context_stats_.num_cache_add_redundant > 0) {
156 RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT,
157 get_context_stats_.num_cache_add_redundant);
158 }
159 if (get_context_stats_.num_cache_bytes_write > 0) {
160 RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
161 get_context_stats_.num_cache_bytes_write);
162 }
163 if (get_context_stats_.num_cache_index_add > 0) {
164 RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
165 get_context_stats_.num_cache_index_add);
166 }
167 if (get_context_stats_.num_cache_index_add_redundant > 0) {
168 RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT,
169 get_context_stats_.num_cache_index_add_redundant);
170 }
171 if (get_context_stats_.num_cache_index_bytes_insert > 0) {
172 RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
173 get_context_stats_.num_cache_index_bytes_insert);
174 }
175 if (get_context_stats_.num_cache_data_add > 0) {
176 RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
177 get_context_stats_.num_cache_data_add);
178 }
179 if (get_context_stats_.num_cache_data_add_redundant > 0) {
180 RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT,
181 get_context_stats_.num_cache_data_add_redundant);
182 }
183 if (get_context_stats_.num_cache_data_bytes_insert > 0) {
184 RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
185 get_context_stats_.num_cache_data_bytes_insert);
186 }
187 if (get_context_stats_.num_cache_filter_add > 0) {
188 RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
189 get_context_stats_.num_cache_filter_add);
190 }
191 if (get_context_stats_.num_cache_filter_add_redundant > 0) {
192 RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT,
193 get_context_stats_.num_cache_filter_add_redundant);
194 }
195 if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
196 RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
197 get_context_stats_.num_cache_filter_bytes_insert);
198 }
199 if (get_context_stats_.num_cache_compression_dict_add > 0) {
200 RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
201 get_context_stats_.num_cache_compression_dict_add);
202 }
203 if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) {
204 RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
205 get_context_stats_.num_cache_compression_dict_add_redundant);
206 }
207 if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
208 RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
209 get_context_stats_.num_cache_compression_dict_bytes_insert);
210 }
211 }
212
213 bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
214 const Slice& value, bool* matched,
215 Cleanable* value_pinner) {
216 assert(matched);
217 assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
218 merge_context_ != nullptr);
219 if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
220 *matched = true;
221 // If the value is not in the snapshot, skip it
222 if (!CheckCallback(parsed_key.sequence)) {
223 return true; // to continue to the next seq
224 }
225
226 appendToReplayLog(replay_log_, parsed_key.type, value);
227
228 if (seq_ != nullptr) {
229 // Set the sequence number if it is uninitialized
230 if (*seq_ == kMaxSequenceNumber) {
231 *seq_ = parsed_key.sequence;
232 }
233 }
234
235 auto type = parsed_key.type;
236 // Key matches. Process it
237 if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
238 max_covering_tombstone_seq_ != nullptr &&
239 *max_covering_tombstone_seq_ > parsed_key.sequence) {
240 type = kTypeRangeDeletion;
241 }
242 switch (type) {
243 case kTypeValue:
244 case kTypeBlobIndex:
245 assert(state_ == kNotFound || state_ == kMerge);
246 if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
247 // Blob value not supported. Stop.
248 state_ = kUnexpectedBlobIndex;
249 return false;
250 }
251 if (kNotFound == state_) {
252 state_ = kFound;
253 if (do_merge_) {
254 if (LIKELY(pinnable_val_ != nullptr)) {
255 if (LIKELY(value_pinner != nullptr)) {
256 // If the backing resources for the value are provided, pin them
257 pinnable_val_->PinSlice(value, value_pinner);
258 } else {
259 TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
260 this);
261
262 // Otherwise copy the value
263 pinnable_val_->PinSelf(value);
264 }
265 }
266 } else {
267 // It means this function is called as part of DB GetMergeOperands
268 // API and the current value should be part of
269 // merge_context_->operand_list
270 push_operand(value, value_pinner);
271 }
272 } else if (kMerge == state_) {
273 assert(merge_operator_ != nullptr);
274 state_ = kFound;
275 if (do_merge_) {
276 if (LIKELY(pinnable_val_ != nullptr)) {
277 Status merge_status = MergeHelper::TimedFullMerge(
278 merge_operator_, user_key_, &value,
279 merge_context_->GetOperands(), pinnable_val_->GetSelf(),
280 logger_, statistics_, env_);
281 pinnable_val_->PinSelf();
282 if (!merge_status.ok()) {
283 state_ = kCorrupt;
284 }
285 }
286 } else {
287 // It means this function is called as part of DB GetMergeOperands
288 // API and the current value should be part of
289 // merge_context_->operand_list
290 push_operand(value, value_pinner);
291 }
292 }
293 if (state_ == kFound) {
294 size_t ts_sz = ucmp_->timestamp_size();
295 if (ts_sz > 0 && timestamp_ != nullptr) {
296 Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
297 timestamp_->assign(ts.data(), ts.size());
298 }
299 }
300 if (is_blob_index_ != nullptr) {
301 *is_blob_index_ = (type == kTypeBlobIndex);
302 }
303 return false;
304
305 case kTypeDeletion:
306 case kTypeDeletionWithTimestamp:
307 case kTypeSingleDeletion:
308 case kTypeRangeDeletion:
309 // TODO(noetzli): Verify correctness once merge of single-deletes
310 // is supported
311 assert(state_ == kNotFound || state_ == kMerge);
312 if (kNotFound == state_) {
313 state_ = kDeleted;
314 } else if (kMerge == state_) {
315 state_ = kFound;
316 if (LIKELY(pinnable_val_ != nullptr)) {
317 if (do_merge_) {
318 Status merge_status = MergeHelper::TimedFullMerge(
319 merge_operator_, user_key_, nullptr,
320 merge_context_->GetOperands(), pinnable_val_->GetSelf(),
321 logger_, statistics_, env_);
322 pinnable_val_->PinSelf();
323 if (!merge_status.ok()) {
324 state_ = kCorrupt;
325 }
326 }
327 // If do_merge_ = false then the current value shouldn't be part of
328 // merge_context_->operand_list
329 }
330 }
331 return false;
332
333 case kTypeMerge:
334 assert(state_ == kNotFound || state_ == kMerge);
335 state_ = kMerge;
336 // value_pinner is not set from plain_table_reader.cc for example.
337 push_operand(value, value_pinner);
338 if (do_merge_ && merge_operator_ != nullptr &&
339 merge_operator_->ShouldMerge(
340 merge_context_->GetOperandsDirectionBackward())) {
341 state_ = kFound;
342 if (LIKELY(pinnable_val_ != nullptr)) {
343 // do_merge_ = true this is the case where this function is called
344 // as part of DB Get API hence merge operators should be merged.
345 if (do_merge_) {
346 Status merge_status = MergeHelper::TimedFullMerge(
347 merge_operator_, user_key_, nullptr,
348 merge_context_->GetOperands(), pinnable_val_->GetSelf(),
349 logger_, statistics_, env_);
350 pinnable_val_->PinSelf();
351 if (!merge_status.ok()) {
352 state_ = kCorrupt;
353 }
354 }
355 }
356 return false;
357 }
358 return true;
359
360 default:
361 assert(false);
362 break;
363 }
364 }
365
366 // state_ could be Corrupt, merge or notfound
367 return false;
368 }
369
370 void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
371 if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
372 value_pinner != nullptr) {
373 value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
374 merge_context_->PushOperand(value, true /*value_pinned*/);
375 } else {
376 merge_context_->PushOperand(value, false);
377 }
378 }
379
380 void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
381 GetContext* get_context, Cleanable* value_pinner) {
382 #ifndef ROCKSDB_LITE
383 Slice s = replay_log;
384 while (s.size()) {
385 auto type = static_cast<ValueType>(*s.data());
386 s.remove_prefix(1);
387 Slice value;
388 bool ret = GetLengthPrefixedSlice(&s, &value);
389 assert(ret);
390 (void)ret;
391
392 bool dont_care __attribute__((__unused__));
393 // Since SequenceNumber is not stored and unknown, we will use
394 // kMaxSequenceNumber.
395 get_context->SaveValue(
396 ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
397 &dont_care, value_pinner);
398 }
399 #else // ROCKSDB_LITE
400 (void)replay_log;
401 (void)user_key;
402 (void)get_context;
403 (void)value_pinner;
404 assert(false);
405 #endif // ROCKSDB_LITE
406 }
407
408 } // namespace ROCKSDB_NAMESPACE