// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/dbformat.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "memory/allocator.h"
#include "memory/concurrent_arena.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "table/multiget_context.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {

struct FlushJobInfo;
class Mutex;
class MemTableIterator;
class MergeContext;

struct ImmutableMemTableOptions {
  explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
                                    const MutableCFOptions& mutable_cf_options);
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool memtable_whole_key_filtering;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
};

// Batched counters to be updated when inserting keys in one write batch.
// In post process of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};

using MultiGetRange = MultiGetContext::Range;
// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
// synchronization if this memtable is immutable. Calling MarkImmutable() is
// not sufficient to guarantee immutability. It is up to higher layers of
// code to determine if this MemTable can still be modified by other threads.
// Eg: The Superversion stores a pointer to the current MemTable (that can
// be modified) and a separate list of the MemTables that can no longer be
// written to (aka the 'immutable memtables').
class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const override;
    virtual int operator()(const char* prefix_len_key,
                           const DecodedType& key) const override;
  };

  // MemTables are reference counted. The initial reference count
  // is zero and the caller must call Ref() at least once.
  //
  // earliest_seq should be the current SequenceNumber in the db such that any
  // key inserted into this memtable will have an equal or larger seq number.
  // (When a db is first created, the earliest sequence number will be 0).
  // If the earliest sequence number is not known, kMaxSequenceNumber may be
  // used, but this may prevent some transactions from succeeding until the
  // first key is inserted into the memtable.
  explicit MemTable(const InternalKeyComparator& comparator,
                    const ImmutableCFOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
                    WriteBufferManager* write_buffer_manager,
                    SequenceNumber earliest_seq, uint32_t column_family_id);
  // No copying allowed
  MemTable(const MemTable&) = delete;
  MemTable& operator=(const MemTable&) = delete;

  // Do not delete this MemTable unless Unref() indicates it is not in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }
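
  // A minimal lifecycle sketch (hypothetical caller code; variable names are
  // illustrative, not part of this API). Since the initial reference count is
  // zero, the creator must call Ref() before use and may only delete the
  // memtable once Unref() hands it back:
  //
  //   MemTable* mem = new MemTable(...);
  //   mem->Ref();
  //   ...                                   // read/write through the memtable
  //   MemTable* to_delete = mem->Unref();
  //   delete to_delete;                     // nullptr while still referenced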

  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate though.
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
  }

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

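  // A hedged sketch of the intended caller pattern (illustrative, not a
  // prescribed contract): the compare-and-swap in MarkFlushScheduled()
  // guarantees that, among all threads observing FLUSH_REQUESTED, exactly one
  // wins the transition to FLUSH_SCHEDULED and schedules the flush job.
  //
  //   if (mem->ShouldScheduleFlush() && mem->MarkFlushScheduled()) {
  //     // this thread won the race; hand the memtable to the flush scheduler
  //   }
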
  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options, SequenceNumber read_seq);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  //
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  bool Add(SequenceNumber seq, ValueType type, const Slice& key,
           const Slice& value, bool allow_concurrent = false,
           MemTablePostProcessInfo* post_process_info = nullptr,
           void** hint = nullptr);

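  // A minimal, hypothetical insertion example (sequence numbers and literals
  // are illustrative only). A Put carries a value; a Delete typically carries
  // an empty one:
  //
  //   bool ok = mem->Add(/*seq=*/100, kTypeValue, "key1", "value1");
  //   assert(ok);
  //   ok = mem->Add(/*seq=*/101, kTypeDeletion, "key1", "");
  //   assert(ok);
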
  // Used to Get value associated with key or Get Merge Operands associated
  // with key.
  // If do_merge = true, the default behavior, which is to get the value for
  // key, is executed. Expected behavior is described right below.
  // If memtable contains a value for key, store it in *value and return true.
  // If memtable contains a deletion for key, store a NotFound() error
  // in *status and return true.
  // If memtable contains Merge operation as the most recent entry for a key,
  //   and the merge process does not stop (not reaching a value or delete),
  //   prepend the current merge operand to *operands,
  //   store MergeInProgress in s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  // If do_merge = false then any Merge Operands encountered for key are simply
  // stored in merge_context.operands_list and never actually merged to get a
  // final value. The raw Merge Operands are eventually returned to the user.
  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr, bool do_merge = true);

  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr, bool do_merge = true) {
    SequenceNumber seq;
    return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
               read_opts, callback, is_blob_index, do_merge);
  }

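  // A hedged lookup sketch (hypothetical caller code; the key and snapshot
  // sequence number are illustrative). Reading at kMaxSequenceNumber sees the
  // newest entry for the key:
  //
  //   LookupKey lkey("key1", kMaxSequenceNumber);
  //   std::string value;
  //   Status s;
  //   MergeContext merge_context;
  //   SequenceNumber max_covering_tombstone_seq = 0;
  //   if (mem->Get(lkey, &value, &s, &merge_context,
  //                &max_covering_tombstone_seq, ReadOptions())) {
  //     // s is OK (value filled in) or NotFound (key was deleted)
  //   }
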
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback, bool* is_blob);

  // Attempts to update the new_value inplace, else does normal Add
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else add(key, new_value)
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Update(SequenceNumber seq,
              const Slice& key,
              const Slice& value);

  // If prev_value for key exists, attempts to update it inplace,
  // else returns false.
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     new_value = delta(prev_value)
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else return false
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  bool UpdateCallback(SequenceNumber seq,
                      const Slice& key,
                      const Slice& delta);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

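  // A hedged sketch of the concurrent write-batch flow this supports
  // (illustrative only): each writer thread passes a thread-local
  // MemTablePostProcessInfo to Add() with allow_concurrent = true, then
  // publishes the batched counters once at the end.
  //
  //   MemTablePostProcessInfo info;
  //   mem->Add(seq, kTypeValue, key, value, /*allow_concurrent=*/true, &info);
  //   ...                            // further Add() calls feeding `info`
  //   mem->BatchPostProcess(info);   // fold counters in, update flush state
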
  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  uint64_t get_data_size() const {
    return data_size_.load(std::memory_order_relaxed);
  }

  // Dynamically change the memtable's capacity. If set below the current
  // usage, the next key added will trigger a flush. Can only increase size
  // when memtable prefix bloom is disabled, since we can't easily allocate
  // more space.
  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
    if (bloom_filter_ == nullptr ||
        new_write_buffer_size < write_buffer_size_) {
      write_buffer_size_.store(new_write_buffer_size,
                               std::memory_order_relaxed);
    }
  }

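  // Illustrative note on the check above (not additional API): when a prefix
  // bloom filter has been allocated, only requests smaller than the current
  // size take effect; without one, any new size is accepted.
  //
  //   mem->UpdateWriteBufferSize(32 << 20);  // request 32 MB; if accepted and
  //                                          // below usage, next Add() flushes
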
  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or
  // equal to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed two phase transaction,
  // we must take note of the log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable (i.e. do not call Add() or Update()).
  void MarkImmutable() {
    table_->MarkReadOnly();
    mem_tracker_.DoneAllocating();
  }

  // Notify the underlying storage that all data it contained has been
  // persisted.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void MarkFlushed() {
    table_->MarkFlushed();
  }

  // Return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
    return table_->IsMergeOperatorSupported();
  }

  // Return true if the current MemTableRep supports snapshots.
  // Inplace update prevents snapshots.
  bool IsSnapshotSupported() const {
    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated with the key.
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
    return &moptions_;
  }

  uint64_t ApproximateOldestKeyTime() const {
    return oldest_key_time_.load(std::memory_order_relaxed);
  }

  // REQUIRES: db_mutex held.
  void SetID(uint64_t id) { id_ = id; }

  uint64_t GetID() const { return id_; }

  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }

  uint64_t GetFileNumber() const { return file_number_; }

  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }

  void SetFlushInProgress(bool in_progress) {
    flush_in_progress_ = in_progress;
  }

#ifndef ROCKSDB_LITE
  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
    flush_job_info_ = std::move(info);
  }

  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
    return std::move(flush_job_info_);
  }
#endif  // !ROCKSDB_LITE

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
  std::unique_ptr<MemTableRep> table_;
  std::unique_ptr<MemTableRep> range_del_table_;
  std::atomic_bool is_range_del_table_empty_;

  // Total data size of all data inserted
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // Dynamically changeable memtable option
  std::atomic<size_t> write_buffer_size_;

  // These are used to manage memtable flushes to storage
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete

  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // the earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> bloom_filter_;

  std::atomic<FlushStateEnum> flush_state_;

  Env* env_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  std::unordered_map<Slice, void*, SliceHasher> insert_hints_;

  // Timestamp of oldest key
  std::atomic<uint64_t> oldest_key_time_;

  // Memtable id to track flush.
  uint64_t id_ = 0;

  // Sequence number of the atomic flush that is responsible for this memtable.
  // The sequence number of atomic flush is a seq, such that no writes with
  // sequence numbers greater than or equal to seq are flushed, while all
  // writes with sequence numbers smaller than seq are flushed.
  SequenceNumber atomic_flush_seqno_;

  // Keeps track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`.
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
  // Flush job info of the current memtable.
  std::unique_ptr<FlushJobInfo> flush_job_info_;
#endif  // !ROCKSDB_LITE

  // Returns a heuristic flush decision
  bool ShouldFlushNow();

  // Updates flush_state_ using ShouldFlushNow()
  void UpdateFlushState();

  void UpdateOldestKeyTime();

  void GetFromTable(const LookupKey& key,
                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
                    ReadCallback* callback, bool* is_blob_index,
                    std::string* value, Status* s, MergeContext* merge_context,
                    SequenceNumber* seq, bool* found_final_value,
                    bool* merge_in_progress);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace ROCKSDB_NAMESPACE