1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #pragma once
10
11 #include <atomic>
12 #include <deque>
13 #include <functional>
14 #include <limits>
15 #include <list>
16 #include <map>
17 #include <set>
18 #include <string>
19 #include <utility>
20 #include <vector>
21
22 #include "db/column_family.h"
23 #include "db/compaction_job.h"
24 #include "db/dbformat.h"
25 #include "db/error_handler.h"
26 #include "db/event_helpers.h"
27 #include "db/external_sst_file_ingestion_job.h"
28 #include "db/flush_job.h"
29 #include "db/flush_scheduler.h"
30 #include "db/internal_stats.h"
31 #include "db/log_writer.h"
32 #include "db/logs_with_prep_tracker.h"
33 #include "db/pre_release_callback.h"
34 #include "db/range_del_aggregator.h"
35 #include "db/read_callback.h"
36 #include "db/snapshot_checker.h"
37 #include "db/snapshot_impl.h"
38 #include "db/version_edit.h"
39 #include "db/wal_manager.h"
40 #include "db/write_controller.h"
41 #include "db/write_thread.h"
42 #include "memtable_list.h"
43 #include "monitoring/instrumented_mutex.h"
44 #include "options/db_options.h"
45 #include "port/port.h"
46 #include "rocksdb/db.h"
47 #include "rocksdb/env.h"
48 #include "rocksdb/memtablerep.h"
49 #include "rocksdb/status.h"
50 #include "rocksdb/trace_reader_writer.h"
51 #include "rocksdb/transaction_log.h"
52 #include "rocksdb/write_buffer_manager.h"
53 #include "table/scoped_arena_iterator.h"
54 #include "util/autovector.h"
55 #include "util/event_logger.h"
56 #include "util/hash.h"
57 #include "util/repeatable_thread.h"
58 #include "util/stop_watch.h"
59 #include "util/thread_local.h"
60 #include "util/trace_replay.h"
61
62 namespace rocksdb {
63
64 class Arena;
65 class ArenaWrappedDBIter;
66 class InMemoryStatsHistoryIterator;
67 class MemTable;
68 class TableCache;
69 class TaskLimiterToken;
70 class Version;
71 class VersionEdit;
72 class VersionSet;
73 class WriteCallback;
74 struct JobContext;
75 struct ExternalSstFileInfo;
76 struct MemTableInfo;
77
78 class DBImpl : public DB {
79 public:
80 DBImpl(const DBOptions& options, const std::string& dbname,
81 const bool seq_per_batch = false, const bool batch_per_txn = true);
82 virtual ~DBImpl();
83
84 using DB::Resume;
85 virtual Status Resume() override;
86
87 // Implementations of the DB interface
88 using DB::Put;
89 virtual Status Put(const WriteOptions& options,
90 ColumnFamilyHandle* column_family, const Slice& key,
91 const Slice& value) override;
92 using DB::Merge;
93 virtual Status Merge(const WriteOptions& options,
94 ColumnFamilyHandle* column_family, const Slice& key,
95 const Slice& value) override;
96 using DB::Delete;
97 virtual Status Delete(const WriteOptions& options,
98 ColumnFamilyHandle* column_family,
99 const Slice& key) override;
100 using DB::SingleDelete;
101 virtual Status SingleDelete(const WriteOptions& options,
102 ColumnFamilyHandle* column_family,
103 const Slice& key) override;
104 using DB::Write;
105 virtual Status Write(const WriteOptions& options,
106 WriteBatch* updates) override;
107
108 using DB::Get;
109 virtual Status Get(const ReadOptions& options,
110 ColumnFamilyHandle* column_family, const Slice& key,
111 PinnableSlice* value) override;
112
113 // Function that Get and KeyMayExist call with no_io true or false
114 // Note: 'value_found' from KeyMayExist propagates here
115 Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
116 const Slice& key, PinnableSlice* value,
117 bool* value_found = nullptr, ReadCallback* callback = nullptr,
118 bool* is_blob_index = nullptr);
119
120 using DB::MultiGet;
121 virtual std::vector<Status> MultiGet(
122 const ReadOptions& options,
123 const std::vector<ColumnFamilyHandle*>& column_family,
124 const std::vector<Slice>& keys,
125 std::vector<std::string>* values) override;
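// Hedged usage sketch (not part of this header): statuses[i] and values[i]
// line up with keys[i], and each lookup can fail independently.
//
//   std::vector<Slice> keys = {"k1", "k2"};
//   std::vector<std::string> values;
//   std::vector<Status> statuses =
//       db->MultiGet(ReadOptions(), {cf, cf}, keys, &values);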
126
127 virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
128 const std::string& column_family,
129 ColumnFamilyHandle** handle) override;
130 virtual Status CreateColumnFamilies(
131 const ColumnFamilyOptions& cf_options,
132 const std::vector<std::string>& column_family_names,
133 std::vector<ColumnFamilyHandle*>* handles) override;
134 virtual Status CreateColumnFamilies(
135 const std::vector<ColumnFamilyDescriptor>& column_families,
136 std::vector<ColumnFamilyHandle*>* handles) override;
137 virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
138 virtual Status DropColumnFamilies(
139 const std::vector<ColumnFamilyHandle*>& column_families) override;
140
141 // Returns false if key doesn't exist in the database and true if it may.
142 // If value_found is not passed in as null, then return the value if found in
143 // memory. On return, if the value was found, then *value_found will be set
144 // to true, otherwise false.
145 using DB::KeyMayExist;
146 virtual bool KeyMayExist(const ReadOptions& options,
147 ColumnFamilyHandle* column_family, const Slice& key,
148 std::string* value,
149 bool* value_found = nullptr) override;
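// A minimal sketch of the semantics described above; a true return value only
// means the key *may* exist, and `value` is usable only when value_found is set.
//
//   std::string value;
//   bool value_found = false;
//   if (db->KeyMayExist(ReadOptions(), cf, "key", &value, &value_found) &&
//       value_found) {
//     // `value` was served from memory and can be used directly.
//   }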
150
151 using DB::NewIterator;
152 virtual Iterator* NewIterator(const ReadOptions& options,
153 ColumnFamilyHandle* column_family) override;
154 virtual Status NewIterators(
155 const ReadOptions& options,
156 const std::vector<ColumnFamilyHandle*>& column_families,
157 std::vector<Iterator*>* iterators) override;
158 ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
159 ColumnFamilyData* cfd,
160 SequenceNumber snapshot,
161 ReadCallback* read_callback,
162 bool allow_blob = false,
163 bool allow_refresh = true);
164
165 virtual const Snapshot* GetSnapshot() override;
166 virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
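// Illustrative sketch of the usual snapshot lifecycle through the public DB
// interface (key and column family are placeholders):
//
//   const Snapshot* snap = db->GetSnapshot();
//   ReadOptions ropts;
//   ropts.snapshot = snap;  // reads below observe a consistent view
//   PinnableSlice value;
//   Status s = db->Get(ropts, db->DefaultColumnFamily(), "key", &value);
//   db->ReleaseSnapshot(snap);  // release, or old data stays pinned for the snapshot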
167 using DB::GetProperty;
168 virtual bool GetProperty(ColumnFamilyHandle* column_family,
169 const Slice& property, std::string* value) override;
170 using DB::GetMapProperty;
171 virtual bool GetMapProperty(
172 ColumnFamilyHandle* column_family, const Slice& property,
173 std::map<std::string, std::string>* value) override;
174 using DB::GetIntProperty;
175 virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
176 const Slice& property, uint64_t* value) override;
177 using DB::GetAggregatedIntProperty;
178 virtual bool GetAggregatedIntProperty(const Slice& property,
179 uint64_t* aggregated_value) override;
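// A hedged sketch of the property getters above; the property names are
// standard RocksDB properties, though the exact set is version-dependent.
//
//   std::string num_keys;
//   db->GetProperty(cf, "rocksdb.estimate-num-keys", &num_keys);
//   uint64_t mem_usage = 0;
//   db->GetIntProperty(cf, "rocksdb.cur-size-all-mem-tables", &mem_usage);
//   uint64_t total = 0;
//   db->GetAggregatedIntProperty("rocksdb.size-all-mem-tables", &total);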
180 using DB::GetApproximateSizes;
181 virtual void GetApproximateSizes(
182 ColumnFamilyHandle* column_family, const Range* range, int n,
183 uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override;
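// Sketch of a typical call matching the signature above (range bounds are
// placeholders); sizes[i] receives the approximate size of ranges[i].
//
//   Range ranges[1] = {Range("a", "z")};
//   uint64_t sizes[1] = {0};
//   db->GetApproximateSizes(cf, ranges, 1, sizes);  // defaults to INCLUDE_FILES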
184 using DB::GetApproximateMemTableStats;
185 virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
186 const Range& range,
187 uint64_t* const count,
188 uint64_t* const size) override;
189 using DB::CompactRange;
190 virtual Status CompactRange(const CompactRangeOptions& options,
191 ColumnFamilyHandle* column_family,
192 const Slice* begin, const Slice* end) override;
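// A hedged example of a full-range manual compaction via the public API;
// nullptr for begin/end means "compact everything", and the option shown is
// just one reasonable choice.
//
//   CompactRangeOptions cro;
//   cro.change_level = true;  // move output to the minimum level that fits it
//   db->CompactRange(cro, db->DefaultColumnFamily(), nullptr, nullptr);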
193
194 using DB::CompactFiles;
195 virtual Status CompactFiles(
196 const CompactionOptions& compact_options,
197 ColumnFamilyHandle* column_family,
198 const std::vector<std::string>& input_file_names, const int output_level,
199 const int output_path_id = -1,
200 std::vector<std::string>* const output_file_names = nullptr,
201 CompactionJobInfo* compaction_job_info = nullptr) override;
202
203 virtual Status PauseBackgroundWork() override;
204 virtual Status ContinueBackgroundWork() override;
205
206 virtual Status EnableAutoCompaction(
207 const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
208
209 using DB::SetOptions;
210 Status SetOptions(
211 ColumnFamilyHandle* column_family,
212 const std::unordered_map<std::string, std::string>& options_map) override;
213
214 virtual Status SetDBOptions(
215 const std::unordered_map<std::string, std::string>& options_map) override;
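// A sketch of changing options dynamically through the two setters above; the
// keys name mutable CF/DB options and the values are arbitrary examples.
//
//   db->SetOptions(cf, {{"write_buffer_size", "131072"},
//                       {"level0_file_num_compaction_trigger", "2"}});
//   db->SetDBOptions({{"max_background_jobs", "4"}});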
216
217 using DB::NumberLevels;
218 virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
219 using DB::MaxMemCompactionLevel;
220 virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
221 using DB::Level0StopWriteTrigger;
222 virtual int Level0StopWriteTrigger(
223 ColumnFamilyHandle* column_family) override;
224 virtual const std::string& GetName() const override;
225 virtual Env* GetEnv() const override;
226 using DB::GetOptions;
227 virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
228 using DB::GetDBOptions;
229 virtual DBOptions GetDBOptions() const override;
230 using DB::Flush;
231 virtual Status Flush(const FlushOptions& options,
232 ColumnFamilyHandle* column_family) override;
233 virtual Status Flush(
234 const FlushOptions& options,
235 const std::vector<ColumnFamilyHandle*>& column_families) override;
236 virtual Status FlushWAL(bool sync) override;
237 bool TEST_WALBufferIsEmpty(bool lock = true);
238 virtual Status SyncWAL() override;
239 virtual Status LockWAL() override;
240 virtual Status UnlockWAL() override;
241
242 virtual SequenceNumber GetLatestSequenceNumber() const override;
243 virtual SequenceNumber GetLastPublishedSequence() const {
244 if (last_seq_same_as_publish_seq_) {
245 return versions_->LastSequence();
246 } else {
247 return versions_->LastPublishedSequence();
248 }
249 }
250 // REQUIRES: joined the main write queue if two_write_queues is disabled, and
251 // the second write queue otherwise.
252 virtual void SetLastPublishedSequence(SequenceNumber seq);
253 // Returns LastSequence in last_seq_same_as_publish_seq_
255 // mode and LastAllocatedSequence otherwise. This is useful when visibility
255 // depends also on data written to the WAL but not to the memtable.
256 SequenceNumber TEST_GetLastVisibleSequence() const;
257
258 virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
259
260 #ifndef ROCKSDB_LITE
261 using DB::ResetStats;
262 virtual Status ResetStats() override;
263 virtual Status DisableFileDeletions() override;
264 virtual Status EnableFileDeletions(bool force) override;
265 virtual int IsFileDeletionsEnabled() const;
266 // All the returned filenames start with "/"
267 virtual Status GetLiveFiles(std::vector<std::string>&,
268 uint64_t* manifest_file_size,
269 bool flush_memtable = true) override;
270 virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
271
272 virtual Status GetUpdatesSince(
273 SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
274 const TransactionLogIterator::ReadOptions& read_options =
275 TransactionLogIterator::ReadOptions()) override;
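// A hedged sketch of tailing the WAL with the iterator returned above;
// `last_seen_seq` is a placeholder for the caller's replay position.
//
//   std::unique_ptr<TransactionLogIterator> iter;
//   Status s = db->GetUpdatesSince(last_seen_seq, &iter);
//   while (s.ok() && iter->Valid()) {
//     BatchResult res = iter->GetBatch();
//     // res.sequence is the starting sequence number of res.writeBatchPtr.
//     iter->Next();
//   }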
276 virtual Status DeleteFile(std::string name) override;
277 Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
278 const RangePtr* ranges, size_t n,
279 bool include_end = true);
280
281 virtual void GetLiveFilesMetaData(
282 std::vector<LiveFileMetaData>* metadata) override;
283
284 // Obtains the meta data of the specified column family of the DB.
285 // Status::NotFound() will be returned if the current DB does not have
286 // any column family matching the specified name.
287 // TODO(yhchiang): output parameter is placed in the end in this codebase.
288 virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
289 ColumnFamilyMetaData* metadata) override;
290
291 Status SuggestCompactRange(ColumnFamilyHandle* column_family,
292 const Slice* begin, const Slice* end) override;
293
294 Status PromoteL0(ColumnFamilyHandle* column_family,
295 int target_level) override;
296
297 // Similar to Write() but will call the callback once on the single write
298 // thread to determine whether it is safe to perform the write.
299 virtual Status WriteWithCallback(const WriteOptions& write_options,
300 WriteBatch* my_batch,
301 WriteCallback* callback);
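// A hedged sketch of a callback a transaction layer might pass in; it assumes
// the WriteCallback interface from db/write_callback.h, whose Callback(DB*) is
// run on the write thread and can veto the write by returning a non-OK status.
//
//   class ConflictCheckCallback : public WriteCallback {
//    public:
//     Status Callback(DB* db) override {
//       return conflict_free_ ? Status::OK() : Status::Busy();
//     }
//     bool AllowWriteBatching() override { return false; }
//     bool conflict_free_ = true;  // placeholder for real conflict checking
//   };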
302
303 // Returns the sequence number that is guaranteed to be smaller than or equal
304 // to the sequence number of any key that could be inserted into the current
305 // memtables. It can then be assumed that any write with a larger (or equal)
306 // sequence number will be present in this memtable or a later memtable.
307 //
308 // If the earliest sequence number could not be determined,
309 // kMaxSequenceNumber will be returned.
310 //
311 // If include_history=true, will also search Memtables in MemTableList
312 // History.
313 SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
314 bool include_history);
315
316 // For a given key, check to see if there are any records for this key
317 // in the memtables, including memtable history. If cache_only is false,
318 // SST files will also be checked.
319 //
320 // If a key is found, *found_record_for_key will be set to true and
321 // *seq will be set to the stored sequence number for the latest
322 // operation on this key or kMaxSequenceNumber if unknown.
323 // If no key is found, *found_record_for_key will be set to false.
324 //
325 // Note: If cache_only=false, it is possible for *seq to be set to 0 if
326 // the sequence number has been cleared from the record. If the caller is
327 // holding an active db snapshot, we know the missing sequence must be less
328 // than the snapshot's sequence number (sequence numbers are only cleared
329 // when there are no earlier active snapshots).
330 //
331 // If NotFound is returned and found_record_for_key is set to false, then no
332 // record for this key was found. If the caller is holding an active db
333 // snapshot, we know that no key could have existed after this snapshot
334 // (since we do not compact keys that have an earlier snapshot).
335 //
336 // Returns OK or NotFound on success,
337 // other status on unexpected error.
338 // TODO(andrewkr): this API needs to be aware of range deletion operations
339 Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
340 bool cache_only, SequenceNumber* seq,
341 bool* found_record_for_key,
342 bool* is_blob_index = nullptr);
343
344 using DB::IngestExternalFile;
345 virtual Status IngestExternalFile(
346 ColumnFamilyHandle* column_family,
347 const std::vector<std::string>& external_files,
348 const IngestExternalFileOptions& ingestion_options) override;
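// Hedged example of ingesting an externally built SST file (the path is a
// placeholder); the file is expected to have been produced by SstFileWriter.
//
//   IngestExternalFileOptions ifo;
//   ifo.move_files = true;  // link/move the file instead of copying, if possible
//   Status s = db->IngestExternalFile(cf, {"/tmp/file1.sst"}, ifo);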
349
350 using DB::IngestExternalFiles;
351 virtual Status IngestExternalFiles(
352 const std::vector<IngestExternalFileArg>& args) override;
353
354 virtual Status VerifyChecksum() override;
355
356 using DB::StartTrace;
357 virtual Status StartTrace(
358 const TraceOptions& options,
359 std::unique_ptr<TraceWriter>&& trace_writer) override;
360
361 using DB::EndTrace;
362 virtual Status EndTrace() override;
363 Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key);
364 Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key);
365 #endif // ROCKSDB_LITE
366
367 // Similar to GetSnapshot(), but also lets the db know that this snapshot
368 // will be used for transaction write-conflict checking. The DB can then
369 // make sure not to compact any keys that would prevent a write-conflict from
370 // being detected.
371 const Snapshot* GetSnapshotForWriteConflictBoundary();
372
373 // Checks if all live files exist on the file system and that their file sizes
374 // match our in-memory records.
375 virtual Status CheckConsistency();
376
377 virtual Status GetDbIdentity(std::string& identity) const override;
378
379 Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
380 int output_level, uint32_t output_path_id,
381 uint32_t max_subcompactions, const Slice* begin,
382 const Slice* end, bool exclusive,
383 bool disallow_trivial_move = false);
384
385 // Return an internal iterator over the current state of the database.
386 // The keys of this iterator are internal keys (see format.h).
387 // The returned iterator should be deleted when no longer needed.
388 InternalIterator* NewInternalIterator(
389 Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
390 ColumnFamilyHandle* column_family = nullptr);
391
392 LogsWithPrepTracker* logs_with_prep_tracker() {
393 return &logs_with_prep_tracker_;
394 }
395
396 #ifndef NDEBUG
397 // Extra methods (for testing) that are not in the public DB interface
398 // Implemented in db_impl_debug.cc
399
400 // Compact any files in the named level that overlap [*begin, *end]
401 Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
402 ColumnFamilyHandle* column_family = nullptr,
403 bool disallow_trivial_move = false);
404
405 void TEST_SwitchWAL();
406
407 bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
408
409 bool TEST_IsLogGettingFlushed() {
410 return alive_log_files_.begin()->getting_flushed;
411 }
412
413 Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
414
415 // Force current memtable contents to be flushed.
416 Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
417 ColumnFamilyHandle* cfh = nullptr);
418
419 // Wait for memtable flushes to finish.
420 Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
421
422 // Wait for any compaction
423 // The bool parameter additionally waits for unscheduled_compactions_ == 0;
424 // this is only used by the special CancelledCompactions test.
425 Status TEST_WaitForCompact(bool waitUnscheduled = false);
426
427 // Return the maximum overlapping data (in bytes) at next level for any
428 // file at a level >= 1.
429 int64_t TEST_MaxNextLevelOverlappingBytes(
430 ColumnFamilyHandle* column_family = nullptr);
431
432 // Return the current manifest file no.
433 uint64_t TEST_Current_Manifest_FileNo();
434
435 // Returns the number that'll be assigned to the next file that's created.
436 uint64_t TEST_Current_Next_FileNo();
437
438 // get total level0 file size. Only for testing.
439 uint64_t TEST_GetLevel0TotalSize();
440
441 void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
442 std::vector<std::vector<FileMetaData>>* metadata);
443
444 void TEST_LockMutex();
445
446 void TEST_UnlockMutex();
447
448 // REQUIRES: mutex locked
449 void* TEST_BeginWrite();
450
451 // REQUIRES: mutex locked
452 // pass the pointer that you got from TEST_BeginWrite()
453 void TEST_EndWrite(void* w);
454
455 uint64_t TEST_MaxTotalInMemoryState() const {
456 return max_total_in_memory_state_;
457 }
458
459 size_t TEST_LogsToFreeSize();
460
461 uint64_t TEST_LogfileNumber();
462
463 uint64_t TEST_total_log_size() const { return total_log_size_; }
464
465 // Returns column family name to ImmutableCFOptions map.
466 Status TEST_GetAllImmutableCFOptions(
467 std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
468
469 // Return the latest MutableCFOptions of a column family
470 Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
471 MutableCFOptions* mutable_cf_options);
472
473 Cache* TEST_table_cache() { return table_cache_.get(); }
474
475 WriteController& TEST_write_controler() { return write_controller_; }
476
477 uint64_t TEST_FindMinLogContainingOutstandingPrep();
478 uint64_t TEST_FindMinPrepLogReferencedByMemTable();
479 size_t TEST_PreparedSectionCompletedSize();
480 size_t TEST_LogsWithPrepSize();
481
482 int TEST_BGCompactionsAllowed() const;
483 int TEST_BGFlushesAllowed() const;
484 size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
485 void TEST_WaitForDumpStatsRun(std::function<void()> callback) const;
486 void TEST_WaitForPersistStatsRun(std::function<void()> callback) const;
487 bool TEST_IsPersistentStatsEnabled() const;
488 size_t TEST_EstiamteStatsHistorySize() const;
489
490 #endif // NDEBUG
491
492 struct BGJobLimits {
493 int max_flushes;
494 int max_compactions;
495 };
496 // Returns maximum background flushes and compactions allowed to be scheduled
497 BGJobLimits GetBGJobLimits() const;
498 // Need a static version that can be called during SanitizeOptions().
499 static BGJobLimits GetBGJobLimits(int max_background_flushes,
500 int max_background_compactions,
501 int max_background_jobs,
502 bool parallelize_compactions);
503
504 // move logs pending closing from job_context to the DB queue and
505 // schedule a purge
506 void ScheduleBgLogWriterClose(JobContext* job_context);
507
508 uint64_t MinLogNumberToKeep();
509
510 // Returns the lower bound file number for SSTs that won't be deleted, even if
511 // they're obsolete. This lower bound is used internally to prevent newly
512 // created flush/compaction output files from being deleted before they're
513 // installed. This technique avoids the need for tracking the exact numbers of
514 // files pending creation, although it prevents more files than necessary from
515 // being deleted.
516 uint64_t MinObsoleteSstNumberToKeep();
517
518 // Returns the list of live files in 'live' and the list
519 // of all files in the filesystem in 'candidate_files'.
520 // If force == false and the last call was less than
521 // db_options_.delete_obsolete_files_period_micros microseconds ago,
522 // it will not fill up the job_context
523 void FindObsoleteFiles(JobContext* job_context, bool force,
524 bool no_full_scan = false);
525
526 // Diffs the files listed in filenames against the live set; those that do
527 // not belong to live files are possibly removed. Also removes all the
528 // files in sst_delete_files and log_delete_files.
529 // It is not necessary to hold the mutex when invoking this method.
530 // If FindObsoleteFiles() was run, we need to also run
531 // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
532 void PurgeObsoleteFiles(JobContext& background_contet,
533 bool schedule_only = false);
534
535 void SchedulePurge();
536
537 ColumnFamilyHandle* DefaultColumnFamily() const override;
538
539 const SnapshotList& snapshots() const { return snapshots_; }
540
541 const ImmutableDBOptions& immutable_db_options() const {
542 return immutable_db_options_;
543 }
544
545 void CancelAllBackgroundWork(bool wait);
546
547 // Find Super version and reference it. Based on options, it might return
548 // the thread local cached one.
549 // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
550 SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
551
552 // Similar to the previous function but looks up based on a column family id.
553 // nullptr will be returned if this column family no longer exists.
554 // REQUIRED: this function should only be called on the write thread or if the
555 // mutex is held.
556 SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
557
558 // Un-reference the super version and clean it up if it is the last reference.
559 void CleanupSuperVersion(SuperVersion* sv);
560
561 // Un-reference the super version and return it to the thread local cache if
562 // needed. If it is the last reference of the super version, clean it up
563 // after un-referencing it.
564 void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
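// Internal usage sketch (an assumption about a typical read path, not a
// verbatim excerpt): every GetAndRefSuperVersion() must be paired with a
// ReturnAndCleanupSuperVersion(), including on error paths.
//
//   SuperVersion* sv = GetAndRefSuperVersion(cfd);
//   // ... read from sv->mem, sv->imm and sv->current ...
//   ReturnAndCleanupSuperVersion(cfd, sv);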
565
566 // Similar to the previous function but looks up based on a column family id.
567 // nullptr will be returned if this column family no longer exists.
568 // REQUIRED: this function should only be called on the write thread.
569 void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);
570
571 // REQUIRED: this function should only be called on the write thread or if the
572 // mutex is held. Return value only valid until next call to this function or
573 // mutex is released.
574 ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
575
576 // Same as above; should be called without the mutex held and not on the write thread.
577 std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
578 uint32_t column_family_id);
579
580 // Returns the number of currently running flushes.
581 // REQUIREMENT: mutex_ must be held when calling this function.
582 int num_running_flushes() {
583 mutex_.AssertHeld();
584 return num_running_flushes_;
585 }
586
587 // Returns the number of currently running compactions.
588 // REQUIREMENT: mutex_ must be held when calling this function.
589 int num_running_compactions() {
590 mutex_.AssertHeld();
591 return num_running_compactions_;
592 }
593
594 const WriteController& write_controller() { return write_controller_; }
595
596 InternalIterator* NewInternalIterator(
597 const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
598 Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence);
599
600 // Hollow transaction shells used for recovery.
601 // These will then be passed to TransactionDB so that
602 // locks can be reacquired before writing can resume.
603 struct RecoveredTransaction {
604 std::string name_;
605 bool unprepared_;
606
607 struct BatchInfo {
608 uint64_t log_number_;
609 // TODO(lth): For unprepared, the memory usage here can be big for
610 // unprepared transactions. This is only useful for rollbacks, and we
611 // can in theory just keep keyset for that.
612 WriteBatch* batch_;
613 // Number of sub-batches. A new sub-batch is created if txn attempts to
614 // insert a duplicate (key, seq) into the memtable. This is currently used in
615 // WritePreparedTxn/WriteUnpreparedTxn.
616 size_t batch_cnt_;
617 };
618
619 // This maps the seq of the first key in the batch to BatchInfo, which
620 // contains WriteBatch and other information relevant to the batch.
621 //
622 // For WriteUnprepared, batches_ can have size greater than 1, but for
623 // other write policies, it must be of size 1.
624 std::map<SequenceNumber, BatchInfo> batches_;
625
626 explicit RecoveredTransaction(const uint64_t log, const std::string& name,
627 WriteBatch* batch, SequenceNumber seq,
628 size_t batch_cnt, bool unprepared)
629 : name_(name), unprepared_(unprepared) {
630 batches_[seq] = {log, batch, batch_cnt};
631 }
632
633 ~RecoveredTransaction() {
634 for (auto& it : batches_) {
635 delete it.second.batch_;
636 }
637 }
638
639 void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
640 size_t batch_cnt, bool unprepared) {
641 assert(batches_.count(seq) == 0);
642 batches_[seq] = {log_number, batch, batch_cnt};
643 // Prior state must be unprepared, since the prepare batch must be the
644 // last batch.
645 assert(unprepared_);
646 unprepared_ = unprepared;
647 }
648 };
649
650 bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
651
652 std::unordered_map<std::string, RecoveredTransaction*>
653 recovered_transactions() {
654 return recovered_transactions_;
655 }
656
657 RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
658 auto it = recovered_transactions_.find(name);
659 if (it == recovered_transactions_.end()) {
660 return nullptr;
661 } else {
662 return it->second;
663 }
664 }
665
666 void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
667 WriteBatch* batch, SequenceNumber seq,
668 size_t batch_cnt, bool unprepared_batch) {
669 // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
670 // times for every unprepared batch encountered during recovery.
671 //
672 // If the transaction is prepared, then the last call to
673 // InsertRecoveredTransaction will have unprepared_batch = false.
674 auto rtxn = recovered_transactions_.find(name);
675 if (rtxn == recovered_transactions_.end()) {
676 recovered_transactions_[name] = new RecoveredTransaction(
677 log, name, batch, seq, batch_cnt, unprepared_batch);
678 } else {
679 rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
680 }
681 logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
682 }
683
684 void DeleteRecoveredTransaction(const std::string& name) {
685 auto it = recovered_transactions_.find(name);
686 assert(it != recovered_transactions_.end());
687 auto* trx = it->second;
688 recovered_transactions_.erase(it);
689 for (const auto& info : trx->batches_) {
690 logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
691 info.second.log_number_);
692 }
693 delete trx;
694 }
695
696 void DeleteAllRecoveredTransactions() {
697 for (auto it = recovered_transactions_.begin();
698 it != recovered_transactions_.end(); it++) {
699 delete it->second;
700 }
701 recovered_transactions_.clear();
702 }
703
704 void AddToLogsToFreeQueue(log::Writer* log_writer) {
705 logs_to_free_queue_.push_back(log_writer);
706 }
707
708 void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
709
710 // Fill JobContext with snapshot information needed by flush and compaction.
711 void GetSnapshotContext(JobContext* job_context,
712 std::vector<SequenceNumber>* snapshot_seqs,
713 SequenceNumber* earliest_write_conflict_snapshot,
714 SnapshotChecker** snapshot_checker);
715
716 // Not thread-safe.
717 void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
718
719 InstrumentedMutex* mutex() { return &mutex_; }
720
721 Status NewDB();
722
723 // This is to be used only by internal rocksdb classes.
724 static Status Open(const DBOptions& db_options, const std::string& name,
725 const std::vector<ColumnFamilyDescriptor>& column_families,
726 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
727 const bool seq_per_batch, const bool batch_per_txn);
728
729 virtual Status Close() override;
730
731 static Status CreateAndNewDirectory(Env* env, const std::string& dirname,
732 std::unique_ptr<Directory>* directory);
733
734 // Given a time window, return an iterator for accessing stats history
735 Status GetStatsHistory(
736 uint64_t start_time, uint64_t end_time,
737 std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
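// A hedged sketch of consuming the iterator declared above; start_time and
// end_time are placeholders, and the accessor names assume the
// StatsHistoryIterator interface in this source tree.
//
//   std::unique_ptr<StatsHistoryIterator> it;
//   db->GetStatsHistory(start_time, end_time, &it);
//   for (; it != nullptr && it->Valid(); it->Next()) {
//     uint64_t when = it->GetStatsTime();
//     const std::map<std::string, uint64_t>& stats = it->GetStatsMap();
//     // ...
//   }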
738
739 // find stats map from stats_history_ with smallest timestamp in
740 // the range of [start_time, end_time)
741 bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
742 uint64_t* new_time,
743 std::map<std::string, uint64_t>* stats_map);
744
745 protected:
746 Env* const env_;
747 const std::string dbname_;
748 std::unique_ptr<VersionSet> versions_;
749 // Flag to check whether we allocated and own the info log file
750 bool own_info_log_;
751 const DBOptions initial_db_options_;
752 const ImmutableDBOptions immutable_db_options_;
753 MutableDBOptions mutable_db_options_;
754 Statistics* stats_;
755 std::unordered_map<std::string, RecoveredTransaction*>
756 recovered_transactions_;
757 std::unique_ptr<Tracer> tracer_;
758 InstrumentedMutex trace_mutex_;
759
760 // State below is protected by mutex_
761 // With two_write_queues enabled, some of the variables that are accessed during
762 // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
763 // logs_, logfile_number_. Refer to the definition of each variable below for
764 // more description.
765 mutable InstrumentedMutex mutex_;
766
767 ColumnFamilyHandleImpl* default_cf_handle_;
768 InternalStats* default_cf_internal_stats_;
769
770 // only used for dynamically adjusting max_total_wal_size. it is a sum of
771 // [write_buffer_size * max_write_buffer_number] over all column families
772 uint64_t max_total_in_memory_state_;
773 // If true, we have only one (default) column family. We use this to optimize
774 // some code-paths
775 bool single_column_family_mode_;
776
777 // The options to access storage files
778 const EnvOptions env_options_;
779
780 // Additional options for compaction and flush
781 EnvOptions env_options_for_compaction_;
782
783 // Persist options to the options file.
784 // If need_mutex_lock = true, the method will lock the DB mutex itself;
785 // otherwise the caller must already hold it. Likewise, if
786 // need_enter_write_thread = true, the method will enter the write thread.
787 Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
788
789 // The following two functions can only be called when:
790 // 1. WriteThread::Writer::EnterUnbatched() is used.
791 // 2. db_mutex is NOT held
792 Status RenameTempFileToOptionsFile(const std::string& file_name);
793 Status DeleteObsoleteOptionsFiles();
794
795 void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
796 const MutableCFOptions& mutable_cf_options,
797 int job_id, TableProperties prop);
798
799 void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta,
800 const MutableCFOptions& mutable_cf_options,
801 int job_id, TableProperties prop);
802
803 void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
804 const Status& st,
805 const CompactionJobStats& job_stats, int job_id);
806
807 void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
808 const Status& st,
809 const CompactionJobStats& job_stats,
810 int job_id);
811 void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
812 const MemTableInfo& mem_table_info);
813
814 #ifndef ROCKSDB_LITE
815 void NotifyOnExternalFileIngested(
816 ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
817 #endif // !ROCKSDB_LITE
818
819 void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
820
821 void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
822
823 void EraseThreadStatusDbInfo() const;
824
825 // If disable_memtable is set the application logic must guarantee that the
826 // batch will still be skipped from the memtable during recovery. An exception
827 // to this is seq_per_batch_ mode, in which, since each batch already takes one
828 // seq, it is ok for the batch to write to memtable during recovery as long as
829 // it only takes one sequence number: i.e., no duplicate keys.
830 // In WriteCommitted it is guaranteed since disable_memtable is used for
831 // prepare batch which will be written to memtable later during the commit,
832 // and in WritePrepared it is guaranteed since it will be used only for WAL
833 // markers which will never be written to memtable. If the commit marker is
834 // accompanied with CommitTimeWriteBatch that is not written to memtable as
835 // long as it has no duplicate keys, it does not violate the one-seq-per-batch
836 // policy.
837 // batch_cnt is expected to be non-zero in seq_per_batch mode and
838 // indicates the number of sub-batches. A sub-batch is a subset of the write
839 // batch that does not have duplicate keys.
840 Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
841 WriteCallback* callback = nullptr,
842 uint64_t* log_used = nullptr, uint64_t log_ref = 0,
843 bool disable_memtable = false, uint64_t* seq_used = nullptr,
844 size_t batch_cnt = 0,
845 PreReleaseCallback* pre_release_callback = nullptr);
846
847 Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
848 WriteCallback* callback = nullptr,
849 uint64_t* log_used = nullptr, uint64_t log_ref = 0,
850 bool disable_memtable = false,
851 uint64_t* seq_used = nullptr);
852
853 // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
854 // the number of sub-batches. A sub-batch is a subset of the write batch that
855 // does not have duplicate keys.
856 Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates,
857 WriteCallback* callback = nullptr,
858 uint64_t* log_used = nullptr, uint64_t log_ref = 0,
859 uint64_t* seq_used = nullptr, size_t batch_cnt = 0,
860 PreReleaseCallback* pre_release_callback = nullptr);
861
862 // Write cached_recoverable_state_ to the memtable if it is not empty.
863 // The writer must be the leader in write_thread_ and hold mutex_.
864 Status WriteRecoverableState();
865
866 // Actual implementation of Close()
867 Status CloseImpl();
868
869 // Recover the descriptor from persistent storage. May do a significant
870 // amount of work to recover recently logged updates. Any changes to
871 // be made to the descriptor are added to *edit.
872 virtual Status Recover(
873 const std::vector<ColumnFamilyDescriptor>& column_families,
874 bool read_only = false, bool error_if_log_file_exist = false,
875 bool error_if_data_exists_in_logs = false);
876
877 private:
878 friend class DB;
879 friend class ErrorHandler;
880 friend class InternalStats;
881 friend class PessimisticTransaction;
882 friend class TransactionBaseImpl;
883 friend class WriteCommittedTxn;
884 friend class WritePreparedTxn;
885 friend class WritePreparedTxnDB;
886 friend class WriteBatchWithIndex;
887 friend class WriteUnpreparedTxnDB;
888 friend class WriteUnpreparedTxn;
889
890 #ifndef ROCKSDB_LITE
891 friend class ForwardIterator;
892 #endif
893 friend struct SuperVersion;
894 friend class CompactedDBImpl;
895 friend class DBTest_ConcurrentFlushWAL_Test;
896 friend class DBTest_MixedSlowdownOptionsStop_Test;
897 friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
898 #ifndef NDEBUG
899 friend class DBTest2_ReadCallbackTest_Test;
900 friend class WriteCallbackTest_WriteWithCallbackTest_Test;
901 friend class XFTransactionWriteHandler;
902 friend class DBBlobIndexTest;
903 friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
904 #endif
905 struct CompactionState;
906
907 struct WriteContext {
908 SuperVersionContext superversion_context;
909 autovector<MemTable*> memtables_to_free_;
910
911 explicit WriteContext(bool create_superversion = false)
912 : superversion_context(create_superversion) {}
913
914 ~WriteContext() {
915 superversion_context.Clean();
916 for (auto& m : memtables_to_free_) {
917 delete m;
918 }
919 }
920 };
921
922 struct PrepickedCompaction;
923 struct PurgeFileInfo;
924
925 Status ResumeImpl();
926
927 void MaybeIgnoreError(Status* s) const;
928
929 const Status CreateArchivalDirectory();
930
931 Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
932 const std::string& cf_name,
933 ColumnFamilyHandle** handle);
934
935 Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
936
937 // Delete any unneeded files and stale in-memory entries.
938 void DeleteObsoleteFiles();
939 // Delete obsolete files and log status and information of file deletion
940 void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
941 const std::string& path_to_sync, FileType type,
942 uint64_t number);
943
944 // Background process needs to call
945 // auto x = CaptureCurrentFileNumberInPendingOutputs()
946 // auto file_num = versions_->NewFileNumber();
947 // <do something>
948 // ReleaseFileNumberFromPendingOutputs(x)
949 // This will protect any file with number `file_num` or greater from being
950 // deleted while <do something> is running.
951 // -----------
952 // This function will capture current file number and append it to
953 // pending_outputs_. This will prevent any background process to delete any
954 // file created after this point.
955 std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
956 // This function should be called with the result of
957 // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
958 // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
959 // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
960 // and blocked by any other pending_outputs_ calls)
961 void ReleaseFileNumberFromPendingOutputs(std::list<uint64_t>::iterator v);
962
963 Status SyncClosedLogs(JobContext* job_context);
964
965 // Flush the in-memory write buffer to storage. Switches to a new
966 // log-file/memtable and writes a new descriptor iff successful. Then
967 // installs a new super version for the column family.
968 Status FlushMemTableToOutputFile(
969 ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
970 bool* madeProgress, JobContext* job_context,
971 SuperVersionContext* superversion_context,
972 std::vector<SequenceNumber>& snapshot_seqs,
973 SequenceNumber earliest_write_conflict_snapshot,
974 SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
975 Env::Priority thread_pri);
976
977 // Argument required by background flush thread.
978 struct BGFlushArg {
979 BGFlushArg()
980 : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
981 BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
982 SuperVersionContext* superversion_context)
983 : cfd_(cfd),
984 max_memtable_id_(max_memtable_id),
985 superversion_context_(superversion_context) {}
986
987 // Column family to flush.
988 ColumnFamilyData* cfd_;
989 // Maximum ID of memtable to flush. In this column family, memtables with
990 // IDs smaller than this value must be flushed before this flush completes.
991 uint64_t max_memtable_id_;
992 // Pointer to a SuperVersionContext object. After flush completes, RocksDB
993 // installs a new superversion for the column family. This operation
994 // requires a SuperVersionContext object (currently embedded in JobContext).
995 SuperVersionContext* superversion_context_;
996 };
997
998 // Argument passed to flush thread.
999 struct FlushThreadArg {
1000 DBImpl* db_;
1001
1002 Env::Priority thread_pri_;
1003 };
1004
1005 // Flush the memtables of (multiple) column families to multiple files on
1006 // persistent storage.
1007 Status FlushMemTablesToOutputFiles(
1008 const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
1009 JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
1010
1011 Status AtomicFlushMemTablesToOutputFiles(
1012 const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
1013 JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
1014
1015 // REQUIRES: log_numbers are sorted in ascending order
1016 Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
1017 SequenceNumber* next_sequence, bool read_only);
1018
1019 // The following two methods are used to flush a memtable to
1020 // storage. The first one is used at database RecoveryTime (when the
1021 // database is opened) and is heavyweight because it holds the mutex
1022 // for the entire period. The second method, WriteLevel0Table, supports
1023 // flushing memtables to storage concurrently.
1024 Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1025 MemTable* mem, VersionEdit* edit);
1026
1027 // Restore alive_log_files_ and total_log_size_ after recovery.
1028 // It needs to run only when there's no flush during recovery
1029 // (e.g. avoid_flush_during_recovery=true). May also trigger flush
1030 // in case total_log_size > max_total_wal_size.
1031 Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
1032
1033 // num_bytes: for slowdown case, delay time is calculated based on
1034 // `num_bytes` going through.
1035 Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
1036
1037 Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
1038 WriteBatch* my_batch);
1039
1040 Status ScheduleFlushes(WriteContext* context);
1041
1042 Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
1043
1044 void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
1045
1046 // Force current memtable contents to be flushed.
1047 Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
1048 FlushReason flush_reason, bool writes_stopped = false);
1049
1050 Status AtomicFlushMemTables(
1051 const autovector<ColumnFamilyData*>& column_family_datas,
1052 const FlushOptions& options, FlushReason flush_reason,
1053 bool writes_stopped = false);
1054
1055 // Wait until flushing this column family won't stall writes
1056 Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
1057 bool* flush_needed);
1058
1059 // Wait for a memtable to be flushed.
1060 // If flush_memtable_id is non-null, wait until the memtable with that ID
1061 // gets flushed. Otherwise, wait until the column family doesn't have any
1062 // memtable pending flush.
1063 // resuming_from_bg_err indicates whether the caller is attempting to resume
1064 // from background error.
1065 Status WaitForFlushMemTable(ColumnFamilyData* cfd,
1066 const uint64_t* flush_memtable_id = nullptr,
1067 bool resuming_from_bg_err = false) {
1068 return WaitForFlushMemTables({cfd}, {flush_memtable_id},
1069 resuming_from_bg_err);
1070 }
1071 // Wait for memtables to be flushed for multiple column families.
1072 Status WaitForFlushMemTables(
1073 const autovector<ColumnFamilyData*>& cfds,
1074 const autovector<const uint64_t*>& flush_memtable_ids,
1075 bool resuming_from_bg_err);
1076
1077 // REQUIRES: mutex locked and in write thread.
1078 void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
1079
1080 // REQUIRES: mutex locked
1081 Status SwitchWAL(WriteContext* write_context);
1082
1083 // REQUIRES: mutex locked
1084 Status HandleWriteBufferFull(WriteContext* write_context);
1085
1086 // REQUIRES: mutex locked
1087 Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
1088 WriteContext* write_context);
1089
1090 WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
1091 WriteBatch* tmp_batch, size_t* write_with_wal,
1092 WriteBatch** to_be_cached_state);
1093
1094 Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
1095 uint64_t* log_used, uint64_t* log_size);
1096
1097 Status WriteToWAL(const WriteThread::WriteGroup& write_group,
1098 log::Writer* log_writer, uint64_t* log_used,
1099 bool need_log_sync, bool need_log_dir_sync,
1100 SequenceNumber sequence);
1101
1102 Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
1103 uint64_t* log_used, SequenceNumber* last_sequence,
1104 size_t seq_inc);
1105
1106 // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
1107 void WriteStatusCheck(const Status& status);
1108
1109 // Used by WriteImpl to update bg_error_ in case of memtable insert error.
1110 void MemTableInsertStatusCheck(const Status& memtable_insert_status);
1111
1112 #ifndef ROCKSDB_LITE
1113
1114 Status CompactFilesImpl(const CompactionOptions& compact_options,
1115 ColumnFamilyData* cfd, Version* version,
1116 const std::vector<std::string>& input_file_names,
1117 std::vector<std::string>* const output_file_names,
1118 const int output_level, int output_path_id,
1119 JobContext* job_context, LogBuffer* log_buffer,
1120 CompactionJobInfo* compaction_job_info);
1121
1122 // Wait for current IngestExternalFile() calls to finish.
1123 // REQUIRES: mutex_ held
1124 void WaitForIngestFile();
1125
1126 #else
1127 // IngestExternalFile is not supported in ROCKSDB_LITE so this function
1128 // will be a no-op
1129 void WaitForIngestFile() {}
1130 #endif // ROCKSDB_LITE
1131
1132 ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
1133
1134 void MaybeScheduleFlushOrCompaction();
1135
1136 // A flush request specifies the column families to flush as well as the
1137 // largest memtable id to persist for each column family. Once all the
1138 // memtables whose IDs are smaller than or equal to this per-column-family
1139 // specified value have been flushed, this flush request is considered to
1140 // have completed its work of flushing this column family. After completing
1141 // the work for all column families in this request, this flush is considered complete.
1142 typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest;
1143
1144 void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
1145 FlushRequest* req);
1146
1147 void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
1148
1149 void SchedulePendingCompaction(ColumnFamilyData* cfd);
1150 void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
1151 FileType type, uint64_t number, int job_id);
1152 static void BGWorkCompaction(void* arg);
1153 // Runs a pre-chosen universal compaction involving bottom level in a
1154 // separate, bottom-pri thread pool.
1155 static void BGWorkBottomCompaction(void* arg);
1156 static void BGWorkFlush(void* arg);
1157 static void BGWorkPurge(void* arg);
1158 static void UnscheduleCompactionCallback(void* arg);
1159 static void UnscheduleFlushCallback(void* arg);
1160 void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
1161 Env::Priority thread_pri);
1162 void BackgroundCallFlush(Env::Priority thread_pri);
1163 void BackgroundCallPurge();
1164 Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
1165 LogBuffer* log_buffer,
1166 PrepickedCompaction* prepicked_compaction,
1167 Env::Priority thread_pri);
1168 Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
1169 LogBuffer* log_buffer, FlushReason* reason,
1170 Env::Priority thread_pri);
1171
1172 bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
1173 const std::vector<CompactionInputFiles>& inputs,
1174 bool* sfm_bookkeeping, LogBuffer* log_buffer);
1175
1176 // Request compaction tasks token from compaction thread limiter.
1177 // It always succeeds if force = true or the limiter is disabled.
1178 bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
1179 std::unique_ptr<TaskLimiterToken>* token,
1180 LogBuffer* log_buffer);
1181
1182 // Schedule background tasks
1183 void StartTimedTasks();
1184
1185 void PrintStatistics();
1186
1187 size_t EstiamteStatsHistorySize() const;
1188
1189 // persist stats to column family "_persistent_stats"
1190 void PersistStats();
1191
1192 // dump rocksdb.stats to LOG
1193 void DumpStats();
1194
1195 // Return the minimum empty level that could hold the total data in the
1196 // input level. Return the input level if no such level could be found.
1197 int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
1198 const MutableCFOptions& mutable_cf_options,
1199 int level);
1200
1201 // Move the files in the input level to the target level.
1202 // If target_level < 0, automatically calculate the minimum level that could
1203 // hold the data set.
1204 Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
1205
1206 // helper functions for adding and removing from flush & compaction queues
1207 void AddToCompactionQueue(ColumnFamilyData* cfd);
1208 ColumnFamilyData* PopFirstFromCompactionQueue();
1209 FlushRequest PopFirstFromFlushQueue();
1210
1211 // Pick the first unthrottled compaction with task token from queue.
1212 ColumnFamilyData* PickCompactionFromQueue(
1213 std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
1214
1215 // helper function to call after some of the logs_ were synced
1216 void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
1217
1218 SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
1219 bool lock = true);
1220
1221 uint64_t GetMaxTotalWalSize() const;
1222
1223 Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
1224
1225 Status CloseHelper();
1226
1227 void WaitForBackgroundWork();
1228
1229 // table_cache_ provides its own synchronization
1230 std::shared_ptr<Cache> table_cache_;
1231
1232 // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
1233 FileLock* db_lock_;
1234
1235 // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
1236 InstrumentedMutex stats_history_mutex_;
1237 // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
1238 // logfile_number_. With two_write_queues it also protects alive_log_files_,
1239 // and log_empty_. Refer to the definition of each variable below for more
1240 // details.
1241 InstrumentedMutex log_write_mutex_;
1242
1243 std::atomic<bool> shutting_down_;
1244 // This condition variable is signaled on these conditions:
1245 // * whenever bg_compaction_scheduled_ goes down to 0
1246 // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
1247 // made any progress
1248 // * whenever a compaction made any progress
1249 // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
1250 // (i.e. whenever a flush is done, even if it didn't make any progress)
1251 // * whenever there is an error in background purge, flush or compaction
1252 // * whenever num_running_ingest_file_ goes to 0.
1253 // * whenever pending_purge_obsolete_files_ goes to 0.
1254 // * whenever disable_delete_obsolete_files_ goes to 0.
1255 // * whenever SetOptions successfully updates options.
1256 // * whenever a column family is dropped.
1257 InstrumentedCondVar bg_cv_;
1258 // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
1259 // must be under either mutex_ or log_write_mutex_. Since after ::Open,
1260 // logfile_number_ is currently updated only in write_thread_, it can be read
1261 // from the same write_thread_ without any locks.
1262 uint64_t logfile_number_;
1263 std::deque<uint64_t>
1264 log_recycle_files_; // a list of log files that we can recycle
1265 bool log_dir_synced_;
1266 // Without two_write_queues, read and writes to log_empty_ are protected by
1267 // mutex_. Since it is currently updated/read only in write_thread_, it can be
1268 // accessed from the same write_thread_ without any locks. With
1269 // two_write_queues writes, where it can be updated in different threads,
1270 // reads and writes are protected by log_write_mutex_ instead. This is to avoid
1271 // an expensive mutex_ lock during WAL writes, which update log_empty_.
1272 bool log_empty_;
1273
1274 std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
1275 struct LogFileNumberSize {
1276 explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
1277 void AddSize(uint64_t new_size) { size += new_size; }
1278 uint64_t number;
1279 uint64_t size = 0;
1280 bool getting_flushed = false;
1281 };
1282 struct LogWriterNumber {
1283 // pass ownership of _writer
1284 LogWriterNumber(uint64_t _number, log::Writer* _writer)
1285 : number(_number), writer(_writer) {}
1286
1287 log::Writer* ReleaseWriter() {
1288 auto* w = writer;
1289 writer = nullptr;
1290 return w;
1291 }
1292 Status ClearWriter() {
1293 Status s = writer->WriteBuffer();
1294 delete writer;
1295 writer = nullptr;
1296 return s;
1297 }
1298
1299 uint64_t number;
1300 // Visual Studio doesn't support noncopyable deque elements, so the writer
1301 // is held as a raw (owned) pointer rather than a std::unique_ptr member.
1302 log::Writer* writer; // own
1303 // true for some prefix of logs_
1304 bool getting_synced = false;
1305 };
1306 // Without two_write_queues, read and writes to alive_log_files_ are
1307 // protected by mutex_. However since back() is never popped, and push_back()
1308 // is done only from write_thread_, the same thread can access the item
1309 // referred by back() without mutex_. With two_write_queues_, writes
1310 // are protected by locking both mutex_ and log_write_mutex_, and reads must
1311 // be under either mutex_ or log_write_mutex_.
1312 std::deque<LogFileNumberSize> alive_log_files_;
1313 // Log files that aren't fully synced, and the current log file.
1314 // Synchronization:
1315 // - push_back() is done from write_thread_ with locked mutex_ and
1316 // log_write_mutex_
1317 // - pop_front() is done from any thread with locked mutex_ and
1318 // log_write_mutex_
1319 // - reads are done with either locked mutex_ or log_write_mutex_
1320 // - back() and items with getting_synced=true are not popped,
1321 // - The same thread that sets getting_synced=true will reset it.
1322 // - it follows that the object referred by back() can be safely read from
1323 // the write_thread_ without using mutex
1324 // - it follows that the items with getting_synced=true can be safely read
1325 // from the same thread that has set getting_synced=true
1326 std::deque<LogWriterNumber> logs_;
1327 // Signaled when getting_synced becomes false for some of the logs_.
1328 InstrumentedCondVar log_sync_cv_;
1329 // This is the app-level state that is written to the WAL but will be used
1330 // only during recovery. Using this feature enables not writing the state to
1331 // memtable on normal writes and hence improving the throughput. Each new
1332 // write of the state will replace the previous state entirely even if the
1333 // keys in the two consecutive states do not overlap.
1334 // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
1335 // Otherwise only the head of write_thread_ can access it.
1336 WriteBatch cached_recoverable_state_;
1337 std::atomic<bool> cached_recoverable_state_empty_ = {true};
1338 std::atomic<uint64_t> total_log_size_;
1339
1340 // If this is non-empty, we need to delete these log files in background
1341 // threads. Protected by db mutex.
1342 autovector<log::Writer*> logs_to_free_;
1343
1344 bool is_snapshot_supported_;
1345
1346 std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
1347
1348 std::map<std::string, uint64_t> stats_slice_;
1349
1350 bool stats_slice_initialized_ = false;
1351
1352 // Class to maintain directories for all database paths other than main one.
1353 class Directories {
1354 public:
1355 Status SetDirectories(Env* env, const std::string& dbname,
1356 const std::string& wal_dir,
1357 const std::vector<DbPath>& data_paths);
1358
1359 Directory* GetDataDir(size_t path_id) const;
1360
1361 Directory* GetWalDir() {
1362 if (wal_dir_) {
1363 return wal_dir_.get();
1364 }
1365 return db_dir_.get();
1366 }
1367
1368 Directory* GetDbDir() { return db_dir_.get(); }
1369
1370 private:
1371 std::unique_ptr<Directory> db_dir_;
1372 std::vector<std::unique_ptr<Directory>> data_dirs_;
1373 std::unique_ptr<Directory> wal_dir_;
1374 };
1375
1376 Directories directories_;
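// Illustrative sketch (comment only) of a typical use of directories_: fsync
// the directory that received a newly created file. Whether GetDataDir() can
// return nullptr depends on how SetDirectories() was called, so the check
// below is defensive.
//
//   Directory* dir = directories_.GetDataDir(file_path_id);
//   if (dir != nullptr) {
//     Status s = dir->Fsync();
//   }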
1377
1378 WriteBufferManager* write_buffer_manager_;
1379
1380 WriteThread write_thread_;
1381 WriteBatch tmp_batch_;
1382 // The write thread used when the writers have no memtable write. This is used
1383 // in 2PC to batch the prepares separately from the serial commit.
1384 WriteThread nonmem_write_thread_;
1385
1386 WriteController write_controller_;
1387
1388 std::unique_ptr<RateLimiter> low_pri_write_rate_limiter_;
1389
1390 // Size of the last batch group. In slowdown mode, the next write needs to
1391 // sleep if it uses up the quota.
1392 // Note: This is to protect memtable and compaction. If the batch only writes
1393 // to the WAL, its size need not be included in this.
1394 uint64_t last_batch_group_size_;
1395
1396 FlushScheduler flush_scheduler_;
1397
1398 SnapshotList snapshots_;
1399
1400 // For each background job, pending_outputs_ keeps the current file number at
1401 // the time that background job started.
1402 // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file with a
1403 // number greater than or equal to the smallest number in pending_outputs_.
1404 // Since file numbers grow monotonically, this also means that
1405 // pending_outputs_ is always sorted. After a background job is done executing,
1406 // its file number is removed from pending_outputs_, which allows
1407 // PurgeObsoleteFiles() to clean it up.
1408 // State is protected by the db mutex. See the sketch below the declaration.
1409 std::list<uint64_t> pending_outputs_;
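// Illustrative sketch (comment only) of the intended pending_outputs_
// protocol for a background job. The capture/release helper names below are
// assumptions for illustration; the actual helpers are declared earlier in
// this class.
//
//   std::list<uint64_t>::iterator pending_elem;
//   {
//     InstrumentedMutexLock l(&mutex_);
//     pending_elem = CaptureCurrentFileNumberInPendingOutputs();
//   }
//   // ... run the flush/compaction; every file it creates gets a number >=
//   // the captured value and is therefore protected from purging ...
//   {
//     InstrumentedMutexLock l(&mutex_);
//     ReleaseFileNumberFromPendingOutputs(pending_elem);
//   }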
1410
1411 // PurgeFileInfo is a structure to hold information of files to be deleted in
1412 // purge_queue_
1413 struct PurgeFileInfo {
1414 std::string fname;
1415 std::string dir_to_sync;
1416 FileType type;
1417 uint64_t number;
1418 int job_id;
1419 PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
1420 int jid)
1421 : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
1422 };
1423
1424 // flush_queue_ and compaction_queue_ hold column families that we need to
1425 // flush and compact, respectively.
1426 // A column family is inserted into flush_queue_ when it satisfies the
1427 // condition cfd->imm()->IsFlushPending().
1428 // A column family is inserted into compaction_queue_ when it satisfies the
1429 // condition cfd->NeedsCompaction().
1430 // Column families in these queues are all Ref()-erenced.
1431 // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
1432 // do RAII on ColumnFamilyData.
1433 // Consumers of these queues are the flush and compaction threads. When a
1434 // column family is put on a queue, we increase unscheduled_flushes_ or
1435 // unscheduled_compactions_. When these variables are bigger than zero, we
1436 // need to schedule background threads for flush and compaction. Once the
1437 // background threads are scheduled, we decrease unscheduled_flushes_ and
1438 // unscheduled_compactions_. That way we keep track of the number of
1439 // compaction and flush threads we need to schedule. This scheduling is done
1440 // in MaybeScheduleFlushOrCompaction(); a sketch of this handshake follows
1441 // the counter declarations below.
1442 // invariant(column family present in flush_queue_ <==>
1443 // ColumnFamilyData::pending_flush_ == true)
1444 std::deque<FlushRequest> flush_queue_;
1445 // invariant(column family present in compaction_queue_ <==>
1446 // ColumnFamilyData::pending_compaction_ == true)
1447 std::deque<ColumnFamilyData*> compaction_queue_;
1448
1449 // A queue to store filenames of the files to be purged
1450 std::deque<PurgeFileInfo> purge_queue_;
1451
1452 // A vector to store the file numbers that have been assigned to a certain
1453 // JobContext. The current implementation tracks SST files only.
1454 std::vector<uint64_t> files_grabbed_for_purge_;
1455
1456 // A queue to store log writers to close
1457 std::deque<log::Writer*> logs_to_free_queue_;
1458 int unscheduled_flushes_;
1459 int unscheduled_compactions_;
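// Schematic sketch (comment only) of the queue/counter handshake described in
// the comment above flush_queue_. Function and limit names here are
// approximations of the flush path; the compaction path is analogous.
//
//   // Producer side, db mutex held:
//   flush_queue_.push_back(flush_req);
//   ++unscheduled_flushes_;
//   MaybeScheduleFlushOrCompaction();
//
//   // Inside MaybeScheduleFlushOrCompaction(), still under the db mutex:
//   while (unscheduled_flushes_ > 0 &&
//          bg_flush_scheduled_ < bg_flush_limit) {  // bg_flush_limit: assumed
//     ++bg_flush_scheduled_;
//     --unscheduled_flushes_;
//     env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
//   }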
1460
1461 // count how many background compactions are running or have been scheduled in
1462 // the BOTTOM pool
1463 int bg_bottom_compaction_scheduled_;
1464
1465 // count how many background compactions are running or have been scheduled
1466 int bg_compaction_scheduled_;
1467
1468 // stores the number of compactions that are currently running
1469 int num_running_compactions_;
1470
1471 // number of background memtable flush jobs, submitted to the HIGH pool
1472 int bg_flush_scheduled_;
1473
1474 // stores the number of flushes that are currently running
1475 int num_running_flushes_;
1476
1477 // number of background obsolete file purge jobs, submitted to the HIGH pool
1478 int bg_purge_scheduled_;
1479
1480 // Information for a manual compaction
1481 struct ManualCompactionState {
1482 ColumnFamilyData* cfd;
1483 int input_level;
1484 int output_level;
1485 uint32_t output_path_id;
1486 Status status;
1487 bool done;
1488 bool in_progress; // compaction request being processed?
1489 bool incomplete; // only part of requested range compacted
1490 bool exclusive; // current behavior of only one manual
1491 bool disallow_trivial_move; // Force actual compaction to run
1492 const InternalKey* begin; // nullptr means beginning of key range
1493 const InternalKey* end; // nullptr means end of key range
1494 InternalKey* manual_end; // how far we are compacting
1495 InternalKey tmp_storage; // Used to keep track of compaction progress
1496 InternalKey tmp_storage1; // Used to keep track of compaction progress
1497 };
1498 struct PrepickedCompaction {
1499 // background compaction takes ownership of `compaction`.
1500 Compaction* compaction;
1501 // caller retains ownership of `manual_compaction_state` as it is reused
1502 // across background compactions.
1503 ManualCompactionState* manual_compaction_state; // nullptr if non-manual
1504 // task limiter token is requested during compaction picking.
1505 std::unique_ptr<TaskLimiterToken> task_token;
1506 };
1507 std::deque<ManualCompactionState*> manual_compaction_dequeue_;
1508
1509 struct CompactionArg {
1510 // caller retains ownership of `db`.
1511 DBImpl* db;
1512 // background compaction takes ownership of `prepicked_compaction`.
1513 PrepickedCompaction* prepicked_compaction;
1514 };
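// Illustrative sketch (comment only) of how a CompactionArg is typically
// handed to the low-priority thread pool. The callback names are
// approximations; ownership follows the comments in the struct above.
//
//   CompactionArg* ca = new CompactionArg;
//   ca->db = this;
//   ca->prepicked_compaction = prepicked_compaction;  // may be nullptr
//   env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
//                  &DBImpl::UnscheduleCallback);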
1515
1516 // Shall we disable deletion of obsolete files?
1517 // If 0, deletion is enabled.
1518 // If non-zero, files will not be deleted.
1519 // Using a counter enables two different threads to call
1520 // EnableFileDeletions() and DisableFileDeletions()
1521 // without any synchronization; see the schematic sketch below the declaration.
1522 int disable_delete_obsolete_files_;
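// Schematic sketch (comment only) of how this counter behaves, following the
// public EnableFileDeletions(force)/DisableFileDeletions() semantics:
//
//   // DisableFileDeletions():
//   ++disable_delete_obsolete_files_;      // deletions disabled while > 0
//
//   // EnableFileDeletions(force):
//   if (force) {
//     disable_delete_obsolete_files_ = 0;  // drop all outstanding disables
//   } else if (disable_delete_obsolete_files_ > 0) {
//     --disable_delete_obsolete_files_;
//   }
//   // Obsolete-file purging resumes once the counter is back to 0.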
1523
1524 // Number of times FindObsoleteFiles has found deletable files and the
1525 // corresponding call to PurgeObsoleteFiles has not yet finished.
1526 int pending_purge_obsolete_files_;
1527
1528 // Last time when DeleteObsoleteFiles with full scan was executed. Originally
1529 // initialized with the startup time.
1530 uint64_t delete_obsolete_files_last_run_;
1531
1532 // last time stats were dumped to LOG
1533 std::atomic<uint64_t> last_stats_dump_time_microsec_;
1534
1535 // Each flush or compaction gets its own job id. This counter makes sure
1536 // they're unique.
1537 std::atomic<int> next_job_id_;
1538
1539 // A flag indicating whether the current rocksdb database has any
1540 // data that is not yet persisted into either WAL or SST file.
1541 // Used when disableWAL is true.
1542 std::atomic<bool> has_unpersisted_data_;
1543
1544 // Set if an attempt was made to flush all column families that
1545 // the oldest log depends on but uncommitted data in the oldest
1546 // log prevents the log from being released.
1547 // We must attempt to free the dependent memtables again
1548 // at a later time, after the transaction in the oldest
1549 // log is fully committed.
1550 bool unable_to_release_oldest_log_;
1551
1552 static const int KEEP_LOG_FILE_NUM = 1000;
1553 // MSVC version 1800 still does not have constexpr for ::max()
1554 static const uint64_t kNoTimeOut = port::kMaxUint64;
1555
1556 std::string db_absolute_path_;
1557
1558 // Number of running IngestExternalFile() calls.
1559 // REQUIRES: mutex held
1560 int num_running_ingest_file_;
1561
1562 #ifndef ROCKSDB_LITE
1563 WalManager wal_manager_;
1564 #endif // ROCKSDB_LITE
1565
1566 // Unified interface for logging events
1567 EventLogger event_logger_;
1568
1569 // A value of > 0 temporarily disables scheduling of background work
1570 int bg_work_paused_;
1571
1572 // A value of > 0 temporarily disables scheduling of background compaction
1573 int bg_compaction_paused_;
1574
1575 // Guard against multiple concurrent refitting
1576 bool refitting_level_;
1577
1578 // Indicate DB was opened successfully
1579 bool opened_successfully_;
1580
1581 // The min threshold to trigger bottommost compaction for removing
1582 // garbage, among all column families.
1583 SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
1584
1585 LogsWithPrepTracker logs_with_prep_tracker_;
1586
1587 // Callback for compaction to check if a key is visible to a snapshot.
1588 // REQUIRES: mutex held
1589 std::unique_ptr<SnapshotChecker> snapshot_checker_;
1590
1591 // Callback for when the cached_recoverable_state_ is written to memtable
1592 // Only to be set during initialization
1593 std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
1594
1595 // handle for scheduling stats dumping at fixed intervals
1596 // REQUIRES: mutex locked
1597 std::unique_ptr<rocksdb::RepeatableThread> thread_dump_stats_;
1598
1599 // handle for scheduling stats snapshotting at fixed intervals
1600 // REQUIRES: mutex locked
1601 std::unique_ptr<rocksdb::RepeatableThread> thread_persist_stats_;
1602
1603 // No copying allowed
1604 DBImpl(const DBImpl&);
1605 void operator=(const DBImpl&);
1606
1607 // Background threads call this function, which is just a wrapper around
1608 // the InstallSuperVersion() function. Background threads carry
1609 // sv_context which can have new_superversion already
1610 // allocated.
1611 // All ColumnFamily state changes go through this function. Here we analyze
1612 // the new state and we schedule background work if we detect that the new
1613 // state needs flush or compaction.
1614 void InstallSuperVersionAndScheduleWork(
1615 ColumnFamilyData* cfd, SuperVersionContext* sv_context,
1616 const MutableCFOptions& mutable_cf_options);
1617
1618 #ifndef ROCKSDB_LITE
1619 using DB::GetPropertiesOfAllTables;
1620 virtual Status GetPropertiesOfAllTables(
1621 ColumnFamilyHandle* column_family,
1622 TablePropertiesCollection* props) override;
1623 virtual Status GetPropertiesOfTablesInRange(
1624 ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
1625 TablePropertiesCollection* props) override;
1626
1627 #endif // ROCKSDB_LITE
1628
1629 bool GetIntPropertyInternal(ColumnFamilyData* cfd,
1630 const DBPropertyInfo& property_info,
1631 bool is_locked, uint64_t* value);
1632 bool GetPropertyHandleOptionsStatistics(std::string* value);
1633
1634 bool HasPendingManualCompaction();
1635 bool HasExclusiveManualCompaction();
1636 void AddManualCompaction(ManualCompactionState* m);
1637 void RemoveManualCompaction(ManualCompactionState* m);
1638 bool ShouldntRunManualCompaction(ManualCompactionState* m);
1639 bool HaveManualCompaction(ColumnFamilyData* cfd);
1640 bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
1641 #ifndef ROCKSDB_LITE
1642 void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
1643 const Status& st,
1644 const CompactionJobStats& compaction_job_stats,
1645 const int job_id, const Version* current,
1646 CompactionJobInfo* compaction_job_info) const;
1647 // Reserve the next 'num' file numbers for to-be-ingested external SST files,
1648 // and return the current file_number in 'next_file_number'.
1649 // Write a version edit to the MANIFEST.
1650 Status ReserveFileNumbersBeforeIngestion(
1651 ColumnFamilyData* cfd, uint64_t num,
1652 std::list<uint64_t>::iterator* pending_output_elem,
1653 uint64_t* next_file_number);
1654 #endif //! ROCKSDB_LITE
1655
1656 bool ShouldPurge(uint64_t file_number) const;
1657 void MarkAsGrabbedForPurge(uint64_t file_number);
1658
1659 size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
1660 Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
1661
1662 // When set, we use a separate queue for writes that don't write to the memtable.
1663 // In 2PC these are the writes at Prepare phase.
1664 const bool two_write_queues_;
1665 const bool manual_wal_flush_;
1666 // If true, increase the sequence number after writing each batch, whether the
1667 // memtable is disabled for that write or not. Otherwise the sequence number is
1668 // increased after writing each key into the memtable, which implies that when
1669 // disable_memtable is set, the sequence number is not increased at all.
1670 //
1671 // Default: false
1672 const bool seq_per_batch_;
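// Worked example (comment only), assuming LastSequence() == 100 and a
// WriteBatch containing Put(a), Put(b), Put(c):
//   - default (seq_per_batch_ == false): the keys consume sequence numbers
//     101, 102, 103 and LastSequence() advances to 103;
//   - seq_per_batch_ == true: the whole batch consumes one sequence number,
//     101, even when disable_memtable is set for the write.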
1673 // This determines during recovery whether we expect one writebatch per
1674 // recovered transaction, or potentially multiple writebatches per
1675 // transaction. For WriteUnprepared, this is set to false, since multiple
1676 // batches can exist per transaction.
1677 //
1678 // Default: true
1679 const bool batch_per_txn_;
1680 // If true, LastSequence also indicates the last published sequence visible to
1681 // the readers. Otherwise LastPublishedSequence should be used.
1682 const bool last_seq_same_as_publish_seq_;
1683 // Indicates that a customized gc algorithm must be used for
1684 // flush/compaction and, if it is not provided via SnapshotChecker, we should
1685 // disable gc to be safe.
1686 const bool use_custom_gc_;
1687 // Flag to indicate that the DB instance shutdown has been initiated. This is
1688 // different from the shutting_down_ atomic in that it is set at the beginning
1689 // of the shutdown sequence, specifically in order to prevent any background
1690 // error recovery from going on in parallel. The latter, shutting_down_,
1691 // is set a little later during the shutdown, after scheduling memtable
1692 // flushes.
1693 std::atomic<bool> shutdown_initiated_;
1694 // Flag to indicate whether the sst_file_manager object was allocated in
1695 // DB::Open() or passed to us.
1696 bool own_sfm_;
1697
1698 // Clients must periodically call SetPreserveDeletesSequenceNumber()
1699 // to advance this seqnum. Default value is 0 which means ALL deletes are
1700 // preserved. Note that this has no effect if DBOptions.preserve_deletes
1701 // is set to false.
1702 std::atomic<SequenceNumber> preserve_deletes_seqnum_;
1703 const bool preserve_deletes_;
1704
1705 // Flag to check whether Close() has been called on this DB
1706 bool closed_;
1707
1708 ErrorHandler error_handler_;
1709
1710 // Conditional variable to coordinate installation of atomic flush results.
1711 // With atomic flush, each bg thread installs the result of flushing multiple
1712 // column families, and different threads can flush different column
1713 // families. It's difficult to rely on one thread to perform batch
1714 // installation for all threads. This is different from the non-atomic flush
1715 // case.
1716 // atomic_flush_install_cv_ makes sure that threads install atomic flush
1717 // results sequentially. Flush results of memtables with lower IDs get
1718 // installed to MANIFEST first.
1719 InstrumentedCondVar atomic_flush_install_cv_;
1720 };
1721
1722 extern Options SanitizeOptions(const std::string& db, const Options& src);
1723
1724 extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
1725
1726 extern CompressionType GetCompressionFlush(
1727 const ImmutableCFOptions& ioptions,
1728 const MutableCFOptions& mutable_cf_options);
1729
1730 // Return the earliest log file to keep after the memtable flush is
1731 // finalized.
1732 // `cfd_to_flush` is the column family whose memtable (specified in
1733 // `memtables_to_flush`) will be flushed and thus will not depend on any WAL
1734 // file.
1735 // The function is only applicable to 2pc mode.
1736 extern uint64_t PrecomputeMinLogNumberToKeep(
1737 VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
1738 autovector<VersionEdit*> edit_list,
1739 const autovector<MemTable*>& memtables_to_flush,
1740 LogsWithPrepTracker* prep_tracker);
1741
1742 // `cfd_to_flush` is the column family whose memtable will be flushed and thus
1743 // will not depend on any WAL file. nullptr means no memtable is being flushed.
1744 // The function is only applicable to 2pc mode.
1745 extern uint64_t FindMinPrepLogReferencedByMemTable(
1746 VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
1747 const autovector<MemTable*>& memtables_to_flush);
1748
1749 // Fix user-supplied options to be reasonable
1750 template <class T, class V>
1751 static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
1752 if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
1753 if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
1754 }
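// Example usage (comment only); SanitizeOptions applies this to option fields,
// e.g. clamping max_open_files into a sane range:
//
//   int max_open_files = 5;
//   ClipToRange(&max_open_files, 20, 1000000);              // -> 20
//   size_t buf = size_t{1} << 40;
//   ClipToRange(&buf, size_t{64} << 10, size_t{64} << 30);  // -> 64 GiB cap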
1755
1756 } // namespace rocksdb