// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/dbformat.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "memory/allocator.h"
#include "memory/concurrent_arena.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "table/multiget_context.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {

struct FlushJobInfo;
class Mutex;
class MemTableIterator;
class MergeContext;

struct ImmutableMemTableOptions {
  explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
                                    const MutableCFOptions& mutable_cf_options);
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool memtable_whole_key_filtering;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
  bool allow_data_in_errors;
};

// Batched counters to be updated when inserting keys in one write batch.
// In post-processing of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};

using MultiGetRange = MultiGetContext::Range;
// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
// synchronization if this memtable is immutable. Calling MarkImmutable() is
// not sufficient to guarantee immutability. It is up to higher layers of
// code to determine if this MemTable can still be modified by other threads.
// E.g.: the SuperVersion stores a pointer to the current MemTable (that can
// be modified) and a separate list of the MemTables that can no longer be
// written to (aka the 'immutable memtables').
class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const override;
    virtual int operator()(const char* prefix_len_key,
                           const DecodedType& key) const override;
  };

  // MemTables are reference counted. The initial reference count
  // is zero and the caller must call Ref() at least once.
  //
  // earliest_seq should be the current SequenceNumber in the db such that any
  // key inserted into this memtable will have an equal or larger seq number.
  // (When a db is first created, the earliest sequence number will be 0.)
  // If the earliest sequence number is not known, kMaxSequenceNumber may be
  // used, but this may prevent some transactions from succeeding until the
  // first key is inserted into the memtable.
  explicit MemTable(const InternalKeyComparator& comparator,
                    const ImmutableCFOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
                    WriteBufferManager* write_buffer_manager,
                    SequenceNumber earliest_seq, uint32_t column_family_id);
  // No copying allowed
  MemTable(const MemTable&) = delete;
  MemTable& operator=(const MemTable&) = delete;

  // Do not delete this MemTable unless Unref() indicates it is not in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }

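  // Illustrative sketch (not part of this header): how a higher layer might
  // manage the lifetime of a memtable. The name `mem` and the surrounding
  // locking discipline are assumptions for the example.
  //
  //   MemTable* mem = new MemTable(...);  // refcount starts at zero
  //   mem->Ref();                         // caller must take a reference
  //   ...                                 // use *mem while holding the db
  //                                       // mutex or the write thread
  //   MemTable* to_delete = mem->Unref();
  //   if (to_delete != nullptr) {
  //     delete to_delete;                 // last reference was dropped
  //   }
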
  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate, though.
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
  }

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

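  // Illustrative sketch (assumed caller, not from this file): the CAS in
  // MarkFlushScheduled() lets exactly one of several concurrent threads win
  // the right to schedule the flush.
  //
  //   if (mem->ShouldScheduleFlush() && mem->MarkFlushScheduled()) {
  //     // Only one thread gets here for a given memtable; hand the
  //     // memtable over to the flush scheduler.
  //   }
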
  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);

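  // Illustrative sketch (assumed usage, not from this file): allocating the
  // iterator from an Arena ties its memory to the arena, so cleanup is an
  // explicit destructor call rather than a delete.
  //
  //   Arena arena;
  //   ReadOptions ro;
  //   InternalIterator* iter = mem->NewIterator(ro, &arena);
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key (user key + seq + type)
  //   }
  //   iter->~InternalIterator();  // memory itself is reclaimed by the arena
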
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options, SequenceNumber read_seq);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  //
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  bool Add(SequenceNumber seq, ValueType type, const Slice& key,
           const Slice& value, bool allow_concurrent = false,
           MemTablePostProcessInfo* post_process_info = nullptr,
           void** hint = nullptr);

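  // Illustrative sketch (assumed values, not from this file): a single-writer
  // insert; with allow_concurrent = false the caller provides the
  // synchronization.
  //
  //   bool ok = mem->Add(seq, kTypeValue, "key1", "value1");
  //   assert(ok);  // false only if the rep rejects a duplicated <key, seq>
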
  // Used to get the value associated with key, or to get the Merge Operands
  // associated with key.
  // If do_merge = true, the default behavior, which is to get the value for
  // the key, is executed. The expected behavior is described right below.
  // If memtable contains a value for key, store it in *value and return true.
  // If memtable contains a deletion for key, store a NotFound() error
  // in *status and return true.
  // If memtable contains a Merge operation as the most recent entry for a key,
  // and the merge process does not stop (not reaching a value or delete),
  // prepend the current merge operand to *operands,
  // store MergeInProgress in *s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  // If do_merge = false then any Merge Operands encountered for key are simply
  // stored in merge_context.operands_list and never actually merged to get a
  // final value. The raw Merge Operands are eventually returned to the user.
  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr, bool do_merge = true) {
    return Get(key, value, /*timestamp=*/nullptr, s, merge_context,
               max_covering_tombstone_seq, seq, read_opts, callback,
               is_blob_index, do_merge);
  }

  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
           Status* s, MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr, bool do_merge = true);

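  // Illustrative sketch (assumed caller, not from this file): a point lookup
  // against this memtable at a given snapshot sequence number.
  //
  //   LookupKey lkey(user_key, snapshot_seq);
  //   std::string value;
  //   Status s;
  //   MergeContext merge_context;
  //   SequenceNumber max_covering_tombstone_seq = 0;
  //   SequenceNumber seq;
  //   ReadOptions ro;
  //   if (mem->Get(lkey, &value, &s, &merge_context,
  //                &max_covering_tombstone_seq, &seq, ro)) {
  //     // Found an entry: s is OK (value filled in) or NotFound (deletion).
  //   }  // On false, fall through to later memtables / SST files.
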
  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
           Status* s, MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr, bool do_merge = true) {
    SequenceNumber seq;
    return Get(key, value, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, callback,
               is_blob_index, do_merge);
  }

  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback, bool* is_blob);

  // Attempts to update new_value inplace, else does a normal Add.
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else add(key, new_value)
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Update(SequenceNumber seq, const Slice& key, const Slice& value);

  // If prev_value for key exists, attempts to update it inplace,
  // else returns false.
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     new_value = delta(prev_value)
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else return false
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  bool UpdateCallback(SequenceNumber seq, const Slice& key, const Slice& delta);

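  // Illustrative sketch (an assumed user-supplied callback, not from this
  // file): UpdateCallback() computes new_value = delta(prev_value) via the
  // inplace_callback in ImmutableMemTableOptions. A callback that appends the
  // delta when it fits in the existing slot might look like the following,
  // where kSlotSize is a hypothetical capacity known to the application:
  //
  //   UpdateStatus AppendDelta(char* existing_value,
  //                            uint32_t* existing_value_size,
  //                            Slice delta_value, std::string* merged_value) {
  //     if (*existing_value_size + delta_value.size() <= kSlotSize) {
  //       memcpy(existing_value + *existing_value_size, delta_value.data(),
  //              delta_value.size());
  //       *existing_value_size += delta_value.size();
  //       return UpdateStatus::UPDATED_INPLACE;  // updated in place
  //     }
  //     merged_value->assign(existing_value, *existing_value_size);
  //     merged_value->append(delta_value.data(), delta_value.size());
  //     return UpdateStatus::UPDATED;  // too big; caller adds merged value
  //   }
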
  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

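  // Illustrative sketch (assumed writer code, not from this file): with
  // concurrent inserts, each writer accumulates its counter updates locally
  // and publishes them once at the end of its batch.
  //
  //   MemTablePostProcessInfo info;
  //   for (const auto& kv : my_batch) {  // my_batch is an assumed container
  //     mem->Add(kv.seq, kTypeValue, kv.key, kv.value,
  //              /*allow_concurrent=*/true, &info);
  //   }
  //   mem->BatchPostProcess(info);  // one atomic update per counter
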
  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  uint64_t get_data_size() const {
    return data_size_.load(std::memory_order_relaxed);
  }

  // Dynamically change the memtable's capacity. If set below the current
  // usage, the next key added will trigger a flush. Can only increase size
  // when memtable prefix bloom is disabled, since we can't easily allocate
  // more space.
  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
    if (bloom_filter_ == nullptr ||
        new_write_buffer_size < write_buffer_size_) {
      write_buffer_size_.store(new_write_buffer_size,
                               std::memory_order_relaxed);
    }
  }

  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed
  // two phase transaction we must take note of the
  // log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable. (I.e., do not call Add() or Update().)
  void MarkImmutable() {
    table_->MarkReadOnly();
    mem_tracker_.DoneAllocating();
  }

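  // Illustrative sketch (assumed flush path, not from this file): the rough
  // order of lifecycle calls once this memtable is switched out. The names
  // `next_wal` and `sst_number` are assumptions for the example.
  //
  //   mem->MarkImmutable();             // no more writes accepted
  //   mem->SetNextLogNumber(next_wal);  // earlier WALs become deletable
  //   mem->SetFlushInProgress(true);
  //   ...                               // flush job writes out the SST
  //   mem->SetFileNumber(sst_number);
  //   mem->SetFlushCompleted(true);
  //   mem->MarkFlushed();
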
  // Notify the underlying storage that all data it contained has been
  // persisted.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void MarkFlushed() {
    table_->MarkFlushed();
  }

  // return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
    return table_->IsMergeOperatorSupported();
  }

  // return true if the current MemTableRep supports snapshots.
  // inplace update prevents snapshots.
  bool IsSnapshotSupported() const {
    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated for the key
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
    return &moptions_;
  }

  uint64_t ApproximateOldestKeyTime() const {
    return oldest_key_time_.load(std::memory_order_relaxed);
  }

  // REQUIRES: db_mutex held.
  void SetID(uint64_t id) { id_ = id; }

  uint64_t GetID() const { return id_; }

  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }

  uint64_t GetFileNumber() const { return file_number_; }

  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }

  void SetFlushInProgress(bool in_progress) {
    flush_in_progress_ = in_progress;
  }

#ifndef ROCKSDB_LITE
  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
    flush_job_info_ = std::move(info);
  }

  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
    return std::move(flush_job_info_);
  }
#endif  // !ROCKSDB_LITE

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
  std::unique_ptr<MemTableRep> table_;
  std::unique_ptr<MemTableRep> range_del_table_;
  std::atomic_bool is_range_del_table_empty_;

  // Total data size of all data inserted
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // Dynamically changeable memtable option
  std::atomic<size_t> write_buffer_size_;

  // These are used to manage memtable flushes to storage
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete

  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // the earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> bloom_filter_;

  std::atomic<FlushStateEnum> flush_state_;

  Env* env_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  std::unordered_map<Slice, void*, SliceHasher> insert_hints_;

  // Timestamp of oldest key
  std::atomic<uint64_t> oldest_key_time_;

  // Memtable id to track flush.
  uint64_t id_ = 0;

  // Sequence number of the atomic flush that is responsible for this memtable.
  // The sequence number of atomic flush is a seq, such that no writes with
  // sequence numbers greater than or equal to seq are flushed, while all
  // writes with sequence numbers smaller than seq are flushed.
  SequenceNumber atomic_flush_seqno_;

  // Keeps track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow()`.
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
  // Flush job info of the current memtable.
  std::unique_ptr<FlushJobInfo> flush_job_info_;
#endif  // !ROCKSDB_LITE

  // Returns a heuristic flush decision
  bool ShouldFlushNow();

  // Updates flush_state_ using ShouldFlushNow()
  void UpdateFlushState();

  void UpdateOldestKeyTime();

  void GetFromTable(const LookupKey& key,
                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
                    ReadCallback* callback, bool* is_blob_index,
                    std::string* value, std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace ROCKSDB_NAMESPACE