]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/rocksdb/db/memtable_list.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / db / memtable_list.h
index b56ad4932c4ec2841fc7248d99af6b4bf3af7c33..a6acf6a324c85eabf8c07fdb9c779f15c660b4b6 100644 (file)
 #include "db/logs_with_prep_tracker.h"
 #include "db/memtable.h"
 #include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
 #include "monitoring/instrumented_mutex.h"
 #include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/types.h"
 #include "util/autovector.h"
-#include "util/filename.h"
-#include "util/log_buffer.h"
 
-namespace rocksdb {
+namespace ROCKSDB_NAMESPACE {
 
 class ColumnFamilyData;
 class InternalKeyComparator;
@@ -33,6 +33,8 @@ class InstrumentedMutex;
 class MergeIteratorBuilder;
 class MemTableList;
 
+struct FlushJobInfo;
+
 // keeps a list of immutable memtables in a vector. the list is immutable
 // if refcount is bigger than one. It is used as a state for Get() and
 // Iterator code paths
@@ -44,7 +46,8 @@ class MemTableListVersion {
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
                                MemTableListVersion* old = nullptr);
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
-                               int max_write_buffer_number_to_maintain);
+                               int max_write_buffer_number_to_maintain,
+                               int64_t max_write_buffer_size_to_maintain);
 
   void Ref();
   void Unref(autovector<MemTable*>* to_delete = nullptr);
@@ -71,6 +74,16 @@ class MemTableListVersion {
                read_opts, callback, is_blob_index);
   }
 
+  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                ReadCallback* callback, bool* is_blob);
+
+  // Returns all the merge operands corresponding to the key by searching all
+  // memtables starting from the most recent one.
+  bool GetMergeOperands(const LookupKey& key, Status* s,
+                        MergeContext* merge_context,
+                        SequenceNumber* max_covering_tombstone_seq,
+                        const ReadOptions& read_opts);
+
   // Similar to Get(), but searches the Memtable history of memtables that
   // have already been flushed.  Should only be used from in-memory only
   // queries (such as Transaction validation) as the history may contain
@@ -132,7 +145,7 @@ class MemTableListVersion {
   // REQUIRE: m is an immutable memtable
   void Remove(MemTable* m, autovector<MemTable*>* to_delete);
 
-  void TrimHistory(autovector<MemTable*>* to_delete);
+  void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
 
   bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
                    std::string* value, Status* s, MergeContext* merge_context,
@@ -145,6 +158,18 @@ class MemTableListVersion {
 
   void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
 
+  // Calculate the total amount of memory used by memlist_ and memlist_history_
+  // excluding the last MemTable in memlist_history_. The reason for excluding
+  // the last MemTable is to see if dropping the last MemTable will keep total
+  // memory usage above or equal to max_write_buffer_size_to_maintain_
+  size_t ApproximateMemoryUsageExcludingLast() const;
+
+  // Whether this version contains flushed memtables that are only kept around
+  // for transaction conflict checking.
+  bool HasHistory() const { return !memlist_history_.empty(); }
+
+  bool MemtableLimitExceeded(size_t usage);
+
   // Immutable MemTables that have not yet been flushed.
   std::list<MemTable*> memlist_;
 
@@ -153,8 +178,11 @@
   std::list<MemTable*> memlist_history_;
 
   // Maximum number of MemTables to keep in memory (including both flushed
   // and not-yet-flushed tables).
   const int max_write_buffer_number_to_maintain_;
+  // Maximum size of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int64_t max_write_buffer_size_to_maintain_;
 
   int refs_ = 0;
 
@@ -169,35 +196,42 @@ class MemTableListVersion {
 // recoverability from a crash.
 //
 //
-// Other than imm_flush_needed, this class is not thread-safe and requires
-// external synchronization (such as holding the db mutex or being on the
-// write thread.)
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
 class MemTableList {
  public:
   // A list of memtables.
   explicit MemTableList(int min_write_buffer_number_to_merge,
-                        int max_write_buffer_number_to_maintain)
+                        int max_write_buffer_number_to_maintain,
+                        int64_t max_write_buffer_size_to_maintain)
       : imm_flush_needed(false),
+        imm_trim_needed(false),
         min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
         current_(new MemTableListVersion(&current_memory_usage_,
-                                         max_write_buffer_number_to_maintain)),
+                                         max_write_buffer_number_to_maintain,
+                                         max_write_buffer_size_to_maintain)),
         num_flush_not_started_(0),
         commit_in_progress_(false),
-        flush_requested_(false) {
+        flush_requested_(false),
+        current_memory_usage_(0),
+        current_memory_usage_excluding_last_(0),
+        current_has_history_(false) {
     current_->Ref();
-    current_memory_usage_ = 0;
   }
 
   // Should not delete MemTableList without making sure MemTableList::current()
   // is Unref()'d.
   ~MemTableList() {}
 
-  MemTableListVersion* current() { return current_; }
+  MemTableListVersion* current() const { return current_; }
 
   // so that background threads can detect non-nullptr pointer to
   // determine whether there is anything more to start flushing.
   std::atomic<bool> imm_flush_needed;
 
+  std::atomic<bool> imm_trim_needed;
+
   // Returns the total number of memtables in the list that haven't yet
   // been flushed and logged.
   int NumNotFlushed() const;
@@ -227,7 +261,8 @@ class MemTableList {
       const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
       VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
       autovector<MemTable*>* to_delete, Directory* db_directory,
-      LogBuffer* log_buffer);
+      LogBuffer* log_buffer,
+      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info);
 
   // New memtables are inserted at the front of the list.
   // Takes ownership of the referenced held on *m by the caller of Add().
@@ -236,6 +271,23 @@ class MemTableList {
   // Returns an estimate of the number of bytes of data in use.
   size_t ApproximateMemoryUsage();
 
+  // Returns the cached current_memory_usage_excluding_last_ value.
+  size_t ApproximateMemoryUsageExcludingLast() const;
+
+  // Returns the cached current_has_history_ value.
+  bool HasHistory() const;
+
+  // Updates current_memory_usage_excluding_last_ and current_has_history_
+  // from MemTableListVersion. Must be called whenever InstallNewVersion is
+  // called.
+  void UpdateCachedValuesFromMemTableListVersion();
+
+  // `usage` is the current size of the mutable Memtable. When
+  // max_write_buffer_size_to_maintain is used, total size of mutable and
+  // immutable memtables is checked against it to decide whether to trim
+  // memtable list.
+  void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
   // Returns an estimate of the number of bytes of data used by
   // the unflushed mem-tables.
   size_t ApproximateUnflushedMemTablesMemoryUsage();
@@ -252,6 +304,20 @@ class MemTableList {
 
   bool HasFlushRequested() { return flush_requested_; }
 
+  // Returns true if a trim history should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkTrimHistoryNeeded() {
+    auto expected = false;
+    return imm_trim_needed.compare_exchange_strong(
+        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  void ResetTrimHistoryNeeded() {
+    auto expected = true;
+    imm_trim_needed.compare_exchange_strong(
+        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
   // Copying allowed
   // MemTableList(const MemTableList&);
   // void operator=(const MemTableList&);
@@ -294,6 +360,13 @@ class MemTableList {
     }
   }
 
+  // Used only by DBImplSecondary during log replay.
+  // Remove memtables whose data were written before the WAL with log_number
+  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+  // not freed, but put into a vector for future deref and reclamation.
+  void RemoveOldMemTables(uint64_t log_number,
+                          autovector<MemTable*>* to_delete);
+
  private:
   friend Status InstallMemtableAtomicFlushResults(
       const autovector<MemTableList*>* imm_lists,
@@ -324,6 +397,12 @@ class MemTableList {
 
   // The current memory usage.
   size_t current_memory_usage_;
+
+  // Cached value of current_->ApproximateMemoryUsageExcludingLast().
+  std::atomic<size_t> current_memory_usage_excluding_last_;
+
+  // Cached value of current_->HasHistory().
+  std::atomic<bool> current_has_history_;
 };
 
 // Installs memtable atomic flush results.
@@ -340,4 +419,4 @@ extern Status InstallMemtableAtomicFlushResults(
     InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
     autovector<MemTable*>* to_delete, Directory* db_directory,
     LogBuffer* log_buffer);
-}  // namespace rocksdb
+}  // namespace ROCKSDB_NAMESPACE