]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/rocksdb/db/memtable_list.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / memtable_list.h
index 0af7a0cea41b5b3f5f503723841d1dbb25bb4c44..1ad28a59e250612dc4551becdad442307590b650 100644 (file)
@@ -12,7 +12,6 @@
 #include <string>
 #include <vector>
 
-#include "db/dbformat.h"
 #include "db/logs_with_prep_tracker.h"
 #include "db/memtable.h"
 #include "db/range_del_aggregator.h"
@@ -58,25 +57,27 @@ class MemTableListVersion {
   // If any operation was found for this key, its most recent sequence number
   // will be stored in *seq on success (regardless of whether true/false is
   // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
-  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
-           Status* s, MergeContext* merge_context,
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
            SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
            const ReadOptions& read_opts, ReadCallback* callback = nullptr,
            bool* is_blob_index = nullptr);
 
-  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
-           Status* s, MergeContext* merge_context,
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
            SequenceNumber* max_covering_tombstone_seq,
            const ReadOptions& read_opts, ReadCallback* callback = nullptr,
            bool* is_blob_index = nullptr) {
     SequenceNumber seq;
-    return Get(key, value, timestamp, s, merge_context,
+    return Get(key, value, columns, timestamp, s, merge_context,
                max_covering_tombstone_seq, &seq, read_opts, callback,
                is_blob_index);
   }
 
   void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
-                ReadCallback* callback, bool* is_blob);
+                ReadCallback* callback);
 
   // Returns all the merge operands corresponding to the key by searching all
   // memtables starting from the most recent one.
@@ -90,19 +91,19 @@ class MemTableListVersion {
   // queries (such as Transaction validation) as the history may contain
   // writes that are also present in the SST files.
   bool GetFromHistory(const LookupKey& key, std::string* value,
-                      std::string* timestamp, Status* s,
-                      MergeContext* merge_context,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
                       SequenceNumber* max_covering_tombstone_seq,
                       SequenceNumber* seq, const ReadOptions& read_opts,
                       bool* is_blob_index = nullptr);
   bool GetFromHistory(const LookupKey& key, std::string* value,
-                      std::string* timestamp, Status* s,
-                      MergeContext* merge_context,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
                       SequenceNumber* max_covering_tombstone_seq,
                       const ReadOptions& read_opts,
                       bool* is_blob_index = nullptr) {
     SequenceNumber seq;
-    return GetFromHistory(key, value, timestamp, s, merge_context,
+    return GetFromHistory(key, value, columns, timestamp, s, merge_context,
                           max_covering_tombstone_seq, &seq, read_opts,
                           is_blob_index);
   }
@@ -115,7 +116,8 @@ class MemTableListVersion {
                     Arena* arena);
 
   void AddIterators(const ReadOptions& options,
-                    MergeIteratorBuilder* merge_iter_builder);
+                    MergeIteratorBuilder* merge_iter_builder,
+                    bool add_range_tombstone_iter);
 
   uint64_t GetTotalNumEntries() const;
 
@@ -130,6 +132,11 @@ class MemTableListVersion {
   // History.
   SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
 
+  // Return the first sequence number from the memtable list, i.e. the
+  // smallest FirstSequenceNumber among all memtables in the list.
+  // Return kMaxSequenceNumber if the list is empty.
+  SequenceNumber GetFirstSequenceNumber() const;
+
  private:
   friend class MemTableList;
 
@@ -138,8 +145,10 @@ class MemTableListVersion {
       const autovector<ColumnFamilyData*>& cfds,
       const autovector<const MutableCFOptions*>& mutable_cf_options_list,
       const autovector<const autovector<MemTable*>*>& mems_list,
-      VersionSet* vset, InstrumentedMutex* mu,
-      const autovector<FileMetaData*>& file_meta,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
       autovector<MemTable*>* to_delete, FSDirectory* db_directory,
       LogBuffer* log_buffer);
 
@@ -152,7 +161,8 @@ class MemTableListVersion {
   bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
 
   bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
-                   std::string* value, std::string* timestamp, Status* s,
+                   std::string* value, PinnableWideColumns* columns,
+                   std::string* timestamp, Status* s,
                    MergeContext* merge_context,
                    SequenceNumber* max_covering_tombstone_seq,
                    SequenceNumber* seq, const ReadOptions& read_opts,
@@ -167,7 +177,7 @@ class MemTableListVersion {
   // excluding the last MemTable in memlist_history_. The reason for excluding
   // the last MemTable is to see if dropping the last MemTable will keep total
   // memory usage above or equal to max_write_buffer_size_to_maintain_
-  size_t ApproximateMemoryUsageExcludingLast() const;
+  size_t MemoryAllocatedBytesExcludingLast() const;
 
   // Whether this version contains flushed memtables that are only kept around
   // for transaction conflict checking.
@@ -220,7 +230,7 @@ class MemTableList {
         commit_in_progress_(false),
         flush_requested_(false),
         current_memory_usage_(0),
-        current_memory_usage_excluding_last_(0),
+        current_memory_allocted_bytes_excluding_last_(0),
         current_has_history_(false) {
     current_->Ref();
   }
@@ -249,10 +259,15 @@ class MemTableList {
   // not yet started.
   bool IsFlushPending() const;
 
+  // Returns true if there is at least one memtable that is pending flush or
+  // flushing.
+  bool IsFlushPendingOrRunning() const;
+
   // Returns the earliest memtables that need to be flushed. The returned
   // memtables are guaranteed to be in the ascending order of created time.
-  void PickMemtablesToFlush(const uint64_t* max_memtable_id,
-                            autovector<MemTable*>* mems);
+  void PickMemtablesToFlush(uint64_t max_memtable_id,
+                            autovector<MemTable*>* mems,
+                            uint64_t* max_next_log_number = nullptr);
 
   // Reset status of the given memtable list back to pending state so that
   // they can get picked up again on the next round of flush.
@@ -268,24 +283,27 @@ class MemTableList {
       autovector<MemTable*>* to_delete, FSDirectory* db_directory,
       LogBuffer* log_buffer,
       std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
-      IOStatus* io_s);
+      bool write_edits = true);
 
   // New memtables are inserted at the front of the list.
   // Takes ownership of the reference held on *m by the caller of Add().
+  // By default, adding memtables will flag that the memtable list needs to be
+  // flushed, but in certain situations, like after a mempurge, we may want to
+  // avoid flushing the memtable list upon addition of a memtable.
   void Add(MemTable* m, autovector<MemTable*>* to_delete);
 
   // Returns an estimate of the number of bytes of data in use.
   size_t ApproximateMemoryUsage();
 
-  // Returns the cached current_memory_usage_excluding_last_ value.
-  size_t ApproximateMemoryUsageExcludingLast() const;
+  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+  size_t MemoryAllocatedBytesExcludingLast() const;
 
   // Returns the cached current_has_history_ value.
   bool HasHistory() const;
 
-  // Updates current_memory_usage_excluding_last_ and current_has_history_
-  // from MemTableListVersion. Must be called whenever InstallNewVersion is
-  // called.
+  // Updates current_memory_allocted_bytes_excluding_last_ and
+  // current_has_history_ from MemTableListVersion. Must be called whenever
+  // InstallNewVersion is called.
   void UpdateCachedValuesFromMemTableListVersion();
 
   // `usage` is the current size of the mutable Memtable. When
@@ -308,7 +326,18 @@ class MemTableList {
   // non-empty (regardless of the min_write_buffer_number_to_merge
   // parameter). This flush request will persist until the next time
   // PickMemtablesToFlush() is called.
-  void FlushRequested() { flush_requested_ = true; }
+  void FlushRequested() {
+    flush_requested_ = true;
+    // If there are some memtables stored in imm() that don't trigger
+    // flush (eg: mempurge output memtable), then update imm_flush_needed.
+    // Note: if a race condition causes imm_flush_needed to be set to true
+    // while num_flush_not_started_ == 0, there is no impact
+    // whatsoever, because imm_flush_needed is only used in an assert
+    // in IsFlushPending().
+    if (num_flush_not_started_ > 0) {
+      imm_flush_needed.store(true, std::memory_order_release);
+    }
+  }
 
   bool HasFlushRequested() { return flush_requested_; }
 
@@ -335,7 +364,7 @@ class MemTableList {
   // Returns the min log containing the prep section after memtables listed in
   // `memtables_to_flush` are flushed and their status is persisted in manifest.
   uint64_t PrecomputeMinLogContainingPrepSection(
-      const autovector<MemTable*>& memtables_to_flush);
+      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
 
   uint64_t GetEarliestMemTableID() const {
     auto& memlist = current_->memlist_;
@@ -381,8 +410,10 @@ class MemTableList {
       const autovector<ColumnFamilyData*>& cfds,
       const autovector<const MutableCFOptions*>& mutable_cf_options_list,
       const autovector<const autovector<MemTable*>*>& mems_list,
-      VersionSet* vset, InstrumentedMutex* mu,
-      const autovector<FileMetaData*>& file_meta,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
       autovector<MemTable*>* to_delete, FSDirectory* db_directory,
       LogBuffer* log_buffer);
 
@@ -413,8 +444,8 @@ class MemTableList {
   // The current memory usage.
   size_t current_memory_usage_;
 
-  // Cached value of current_->ApproximateMemoryUsageExcludingLast().
-  std::atomic<size_t> current_memory_usage_excluding_last_;
+  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
 
   // Cached value of current_->HasHistory().
   std::atomic<bool> current_has_history_;
@@ -431,7 +462,10 @@ extern Status InstallMemtableAtomicFlushResults(
     const autovector<ColumnFamilyData*>& cfds,
     const autovector<const MutableCFOptions*>& mutable_cf_options_list,
     const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
-    InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_meta,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
     autovector<MemTable*>* to_delete, FSDirectory* db_directory,
     LogBuffer* log_buffer);
 }  // namespace ROCKSDB_NAMESPACE