// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/dbformat.h"
#include "db/range_del_aggregator.h"
#include "db/version_edit.h"
#include "memtable/memtable_allocator.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "util/concurrent_arena.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"

namespace rocksdb {

class Mutex;
class MemTableIterator;
class MergeContext;
class InternalIterator;

struct MemTableOptions {
  explicit MemTableOptions(const ImmutableCFOptions& ioptions,
                           const MutableCFOptions& mutable_cf_options);
  size_t write_buffer_size;
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
};

// Batched counters to be updated when inserting keys in one write batch.
// In the post-processing of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};

// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
// synchronization if this memtable is immutable. Calling MarkImmutable() is
// not sufficient to guarantee immutability. It is up to higher layers of
// code to determine if this MemTable can still be modified by other threads.
// Eg: The Superversion stores a pointer to the current MemTable (that can
// be modified) and a separate list of the MemTables that can no longer be
// written to (aka the 'immutable memtables').
class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const override;
    virtual int operator()(const char* prefix_len_key,
                           const Slice& key) const override;
  };

  // MemTables are reference counted. The initial reference count
  // is zero and the caller must call Ref() at least once.
  //
  // earliest_seq should be the current SequenceNumber in the db such that any
  // key inserted into this memtable will have an equal or larger seq number.
  // (When a db is first created, the earliest sequence number will be 0.)
  // If the earliest sequence number is not known, kMaxSequenceNumber may be
  // used, but this may prevent some transactions from succeeding until the
  // first key is inserted into the memtable.
  explicit MemTable(const InternalKeyComparator& comparator,
                    const ImmutableCFOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
                    WriteBufferManager* write_buffer_manager,
                    SequenceNumber earliest_seq);

  // Do not delete this MemTable unless Unref() indicates that it is no
  // longer in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }

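  // Usage sketch (illustrative only, not part of the original header): the
  // owner takes the initial reference and deletes the table once Unref()
  // hands it back. The constructor arguments are placeholders for
  // caller-supplied objects.
  //
  //   MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options,
  //                                write_buffer_manager, earliest_seq);
  //   mem->Ref();                          // take the initial reference
  //   ... read from / write to the memtable ...
  //   MemTable* to_delete = mem->Unref();  // drop the reference
  //   if (to_delete != nullptr) {
  //     delete to_delete;                  // refcount reached zero
  //   }
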
  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

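  // Illustrative sketch (an assumption about typical call sites, not code
  // from this header): the compare-and-swap in MarkFlushScheduled() lets
  // exactly one of several racing threads win the right to schedule the
  // flush. `ScheduleFlush` is a hypothetical caller-side function.
  //
  //   if (mem->ShouldScheduleFlush() && mem->MarkFlushScheduled()) {
  //     // This thread won the CAS; every other racing thread sees false.
  //     ScheduleFlush(mem);
  //   }
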
  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the returned iterator will destroy all the
  //        states except those allocated in the arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  InternalIterator* NewRangeTombstoneIterator(const ReadOptions& read_options);

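  // Illustrative sketch of NewIterator (not from the original header): with
  // an arena, the iterator's memory is owned by the arena, so the caller
  // invokes the destructor explicitly instead of using delete.
  //
  //   Arena arena;
  //   ReadOptions ro;
  //   InternalIterator* iter = mem->NewIterator(ro, &arena);
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key; iter->value() is the user value.
  //   }
  //   iter->~InternalIterator();  // memory itself is reclaimed by the arena
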
  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  void Add(SequenceNumber seq, ValueType type, const Slice& key,
           const Slice& value, bool allow_concurrent = false,
           MemTablePostProcessInfo* post_process_info = nullptr);

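  // Illustrative sketch (assumed usage, not from the original header):
  // inserting a value and then a deletion under single-writer
  // synchronization. The sequence numbers are literals only for
  // illustration.
  //
  //   mem->Add(/*seq=*/10, kTypeValue, "key1", "value1");
  //   mem->Add(/*seq=*/11, kTypeDeletion, "key1", "");  // tombstone for key1
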
  // If memtable contains a value for key, store it in *value and return true.
  // If memtable contains a deletion for key, store a NotFound() error
  // in *s and return true.
  // If memtable contains a Merge operation as the most recent entry for a
  // key, and the merge process does not stop (i.e., it has not yet reached
  // a value or a delete), prepend the current merge operand to *operands,
  // store MergeInProgress in *s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext* merge_context, RangeDelAggregator* range_del_agg,
           SequenceNumber* seq, const ReadOptions& read_opts);

  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext* merge_context, RangeDelAggregator* range_del_agg,
           const ReadOptions& read_opts) {
    SequenceNumber seq;
    return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts);
  }

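  // Illustrative sketch (assumed usage, not from the original header): a
  // point lookup at a snapshot sequence number. LookupKey, MergeContext,
  // and RangeDelAggregator are the real types used by this codebase, but
  // the construction shown here (icmp, snapshot_seq) is a simplified
  // assumption about their constructors.
  //
  //   LookupKey lkey("key1", /*sequence=*/snapshot_seq);
  //   std::string value;
  //   Status s;
  //   MergeContext merge_context;
  //   RangeDelAggregator range_del_agg(icmp, {snapshot_seq});
  //   ReadOptions ro;
  //   if (mem->Get(lkey, &value, &s, &merge_context, &range_del_agg, ro)) {
  //     // Found here: s is OK (value filled in) or NotFound (deletion).
  //   } else {
  //     // Not found, or merge in progress; consult older memtables and
  //     // SST files.
  //   }
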
  // Attempts to update the value for key in place; otherwise, does a
  // normal Add.
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else add(key, new_value)
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Update(SequenceNumber seq,
              const Slice& key,
              const Slice& value);

  // If prev_value for key exists, attempts to update it in place.
  // Otherwise returns false.
  // Pseudocode
  //   if key exists in current memtable && prev_value is of type kTypeValue
  //     new_value = delta(prev_value)
  //     if sizeof(new_value) <= sizeof(prev_value)
  //       update inplace
  //     else add(key, new_value)
  //   else return false
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  bool UpdateCallback(SequenceNumber seq,
                      const Slice& key,
                      const Slice& delta);

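  // Illustrative sketch (an assumption, not from the original header): a
  // counter-style inplace_callback (see MemTableOptions) of the kind
  // UpdateCallback would invoke with the existing value and the caller's
  // delta. `IncrementCounter` is a hypothetical name.
  //
  //   UpdateStatus IncrementCounter(char* existing_value,
  //                                 uint32_t* existing_value_size,
  //                                 Slice delta_value,
  //                                 std::string* merged_value) {
  //     uint64_t v = DecodeFixed64(existing_value);
  //     v += DecodeFixed64(delta_value.data());
  //     EncodeFixed64(existing_value, v);  // same size: update in place,
  //                                        // *existing_value_size unchanged
  //     return UpdateStatus::UPDATED_INPLACE;
  //   }
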
  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

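  // Illustrative sketch (assumed usage, not from the original header): with
  // concurrent writers, each thread accumulates its counter deltas locally
  // and publishes them once at the end, so the shared atomics are touched
  // once per batch instead of once per key.
  //
  //   MemTablePostProcessInfo info;
  //   mem->Add(seq,     kTypeValue, "k1", "v1", /*allow_concurrent=*/true,
  //            &info);
  //   mem->Add(seq + 1, kTypeValue, "k2", "v2", /*allow_concurrent=*/true,
  //            &info);
  //   mem->BatchPostProcess(info);  // single atomic update per counter
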
  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or
  // equal to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed
  // two phase transaction we must take note of the
  // log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable (i.e. do not call Add() or Update()).
  void MarkImmutable() {
    table_->MarkReadOnly();
    allocator_.DoneAllocating();
  }

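  // Illustrative sketch (an assumption about higher layers, not code from
  // this header): the switch performed when a memtable fills up, under the
  // db mutex or the single writer thread.
  //
  //   mem->MarkImmutable();            // no more writes to this memtable
  //   mem->SetNextLogNumber(new_log);  // earlier WALs deletable after flush
  //   // hand `mem` to the immutable-memtable list; reads continue until
  //   // the flush completes and the last reference is dropped via Unref().
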
  // Return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
    return table_->IsMergeOperatorSupported();
  }

  // Return true if the current MemTableRep supports snapshots.
  // Inplace update prevents snapshots.
  bool IsSnapshotSupported() const {
    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated with the key.
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const MemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  ConcurrentArena arena_;
  MemTableAllocator allocator_;
  unique_ptr<MemTableRep> table_;
  unique_ptr<MemTableRep> range_del_table_;
  bool is_range_del_table_empty_;

  // Total data size of all data inserted.
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // These are used to manage memtable flushes to storage.
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete

  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first.
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // The earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> prefix_bloom_;

  std::atomic<FlushStateEnum> flush_state_;

  Env* env_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  std::unordered_map<Slice, void*, SliceHasher> insert_hints_;

  // Returns a heuristic flush decision.
  bool ShouldFlushNow() const;

  // Updates flush_state_ using ShouldFlushNow().
  void UpdateFlushState();

  // No copying allowed
  MemTable(const MemTable&);
  MemTable& operator=(const MemTable&);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace rocksdb