ceph/src/rocksdb/db/db_impl/db_impl.h

   1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
   2 //  This source code is licensed under both the GPLv2 (found in the
   3 //  COPYING file in the root directory) and Apache 2.0 License
   4 //  (found in the LICENSE.Apache file in the root directory).
   5 //
   6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
   7 // Use of this source code is governed by a BSD-style license that can be
   8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
   9 #pragma once
  10
  11 #include <atomic>
  12 #include <deque>
  13 #include <functional>
  14 #include <limits>
  15 #include <list>
  16 #include <map>
  17 #include <set>
  18 #include <string>
  19 #include <utility>
  20 #include <vector>
  21
  22 #include "db/column_family.h"
  23 #include "db/compaction/compaction_iterator.h"
  24 #include "db/compaction/compaction_job.h"
  25 #include "db/error_handler.h"
  26 #include "db/event_helpers.h"
  27 #include "db/external_sst_file_ingestion_job.h"
  28 #include "db/flush_job.h"
  29 #include "db/flush_scheduler.h"
  30 #include "db/import_column_family_job.h"
  31 #include "db/internal_stats.h"
  32 #include "db/log_writer.h"
  33 #include "db/logs_with_prep_tracker.h"
  34 #include "db/memtable_list.h"
  35 #include "db/periodic_task_scheduler.h"
  36 #include "db/post_memtable_callback.h"
  37 #include "db/pre_release_callback.h"
  38 #include "db/range_del_aggregator.h"
  39 #include "db/read_callback.h"
  40 #include "db/seqno_to_time_mapping.h"
  41 #include "db/snapshot_checker.h"
  42 #include "db/snapshot_impl.h"
  43 #include "db/trim_history_scheduler.h"
  44 #include "db/version_edit.h"
  45 #include "db/wal_manager.h"
  46 #include "db/write_controller.h"
  47 #include "db/write_thread.h"
  48 #include "logging/event_logger.h"
  49 #include "monitoring/instrumented_mutex.h"
  50 #include "options/db_options.h"
  51 #include "port/port.h"
  52 #include "rocksdb/db.h"
  53 #include "rocksdb/env.h"
  54 #include "rocksdb/memtablerep.h"
  55 #include "rocksdb/status.h"
  56 #ifndef ROCKSDB_LITE
  57 #include "rocksdb/trace_reader_writer.h"
  58 #endif  // ROCKSDB_LITE
  59 #include "rocksdb/transaction_log.h"
  60 #ifndef ROCKSDB_LITE
  61 #include "rocksdb/utilities/replayer.h"
  62 #endif  // ROCKSDB_LITE
  63 #include "rocksdb/write_buffer_manager.h"
  64 #include "table/merging_iterator.h"
  65 #include "table/scoped_arena_iterator.h"
  66 #include "util/autovector.h"
  67 #include "util/hash.h"
  68 #include "util/repeatable_thread.h"
  69 #include "util/stop_watch.h"
  70 #include "util/thread_local.h"
  71
  72 namespace ROCKSDB_NAMESPACE {
  73
  74 class Arena;
  75 class ArenaWrappedDBIter;
  76 class InMemoryStatsHistoryIterator;
  77 class MemTable;
  78 class PersistentStatsHistoryIterator;
  79 class TableCache;
  80 class TaskLimiterToken;
  81 class Version;
  82 class VersionEdit;
  83 class VersionSet;
  84 class WriteCallback;
  85 struct JobContext;
  86 struct ExternalSstFileInfo;
  87 struct MemTableInfo;
  88
  89 // Class to maintain directories for all database paths other than main one.
  90 class Directories {
  91  public:
  92   IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
  93                           const std::string& wal_dir,
  94                           const std::vector<DbPath>& data_paths);
  95
  96   FSDirectory* GetDataDir(size_t path_id) const {
  97     assert(path_id < data_dirs_.size());
  98     FSDirectory* ret_dir = data_dirs_[path_id].get();
  99     if (ret_dir == nullptr) {
 100       // Should use db_dir_
 101       return db_dir_.get();
 102     }
 103     return ret_dir;
 104   }
 105
 106   FSDirectory* GetWalDir() {
 107     if (wal_dir_) {
 108       return wal_dir_.get();
 109     }
 110     return db_dir_.get();
 111   }
 112
 113   FSDirectory* GetDbDir() { return db_dir_.get(); }
 114
 115   IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
 116     // close all directories for all database paths
 117     IOStatus s = IOStatus::OK();
 118
 119     // The default implementation for Close() in Directory/FSDirectory class
 120     // "NotSupported" status, the upper level interface should be able to
 121     // handle this error so that Close() does not fail after upgrading when
 122     // run on FileSystems that have not implemented `Directory::Close()` or
 123     // `FSDirectory::Close()` yet
 124
 125     if (db_dir_) {
 126       IOStatus temp_s = db_dir_->Close(options, dbg);
 127       if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
 128         s = std::move(temp_s);
 129       }
 130     }
 131
 132     // Attempt to close everything even if one fails
 133     s.PermitUncheckedError();
 134
 135     if (wal_dir_) {
 136       IOStatus temp_s = wal_dir_->Close(options, dbg);
 137       if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
 138         s = std::move(temp_s);
 139       }
 140     }
 141
 142     s.PermitUncheckedError();
 143
 144     for (auto& data_dir_ptr : data_dirs_) {
 145       if (data_dir_ptr) {
 146         IOStatus temp_s = data_dir_ptr->Close(options, dbg);
 147         if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
 148           s = std::move(temp_s);
 149         }
 150       }
 151     }
 152
 153     // Ready for caller
 154     s.MustCheck();
 155     return s;
 156   }
 157
 158  private:
 159   std::unique_ptr<FSDirectory> db_dir_;
 160   std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
 161   std::unique_ptr<FSDirectory> wal_dir_;
 162 };
 163
 164 // DB is the public interface of RocksDB, and DBImpl is the actual class
 165 // implementing it. It is the entry point to the core RocksDB engine.
 166 // All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
 167 // DBImpl internally.
 168 // Other than functions implementing the DB interface, some public
 169 // functions are there for other internal components to call. For
 170 // example, TransactionDB directly calls DBImpl::WriteImpl() and
 171 // BlobDB directly calls DBImpl::GetImpl(). Some other functions
 172 // are for sub-components to call. For example, ColumnFamilyHandleImpl
 173 // calls DBImpl::FindObsoleteFiles().
 174 //
 175 // Since it's a very large class, the definition of the functions is
 176 // divided in several db_impl_*.cc files, besides db_impl.cc.
 177 class DBImpl : public DB {
 178  public:
 179   DBImpl(const DBOptions& options, const std::string& dbname,
 180          const bool seq_per_batch = false, const bool batch_per_txn = true,
 181          bool read_only = false);
 182   // No copying allowed
 183   DBImpl(const DBImpl&) = delete;
 184   void operator=(const DBImpl&) = delete;
 185
 186   virtual ~DBImpl();
 187
 188   // ---- Implementations of the DB interface ----
 189
 190   using DB::Resume;
 191   Status Resume() override;
 192
 193   using DB::Put;
 194   Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
 195              const Slice& key, const Slice& value) override;
 196   Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
 197              const Slice& key, const Slice& ts, const Slice& value) override;
 198
 199   using DB::PutEntity;
 200   Status PutEntity(const WriteOptions& options,
 201                    ColumnFamilyHandle* column_family, const Slice& key,
 202                    const WideColumns& columns) override;
 203
 204   using DB::Merge;
 205   Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
 206                const Slice& key, const Slice& value) override;
 207   Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
 208                const Slice& key, const Slice& ts, const Slice& value) override;
 209
 210   using DB::Delete;
 211   Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
 212                 const Slice& key) override;
 213   Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
 214                 const Slice& key, const Slice& ts) override;
 215
 216   using DB::SingleDelete;
 217   Status SingleDelete(const WriteOptions& options,
 218                       ColumnFamilyHandle* column_family,
 219                       const Slice& key) override;
 220   Status SingleDelete(const WriteOptions& options,
 221                       ColumnFamilyHandle* column_family, const Slice& key,
 222                       const Slice& ts) override;
 223
 224   using DB::DeleteRange;
 225   Status DeleteRange(const WriteOptions& options,
 226                      ColumnFamilyHandle* column_family, const Slice& begin_key,
 227                      const Slice& end_key) override;
 228   Status DeleteRange(const WriteOptions& options,
 229                      ColumnFamilyHandle* column_family, const Slice& begin_key,
 230                      const Slice& end_key, const Slice& ts) override;
 231
 232   using DB::Write;
 233   virtual Status Write(const WriteOptions& options,
 234                        WriteBatch* updates) override;
 235
 236   using DB::Get;
 237   virtual Status Get(const ReadOptions& options,
 238                      ColumnFamilyHandle* column_family, const Slice& key,
 239                      PinnableSlice* value) override;
 240   virtual Status Get(const ReadOptions& options,
 241                      ColumnFamilyHandle* column_family, const Slice& key,
 242                      PinnableSlice* value, std::string* timestamp) override;
 243
 244   using DB::GetEntity;
 245   Status GetEntity(const ReadOptions& options,
 246                    ColumnFamilyHandle* column_family, const Slice& key,
 247                    PinnableWideColumns* columns) override;
 248
 249   using DB::GetMergeOperands;
 250   Status GetMergeOperands(const ReadOptions& options,
 251                           ColumnFamilyHandle* column_family, const Slice& key,
 252                           PinnableSlice* merge_operands,
 253                           GetMergeOperandsOptions* get_merge_operands_options,
 254                           int* number_of_operands) override {
 255     GetImplOptions get_impl_options;
 256     get_impl_options.column_family = column_family;
 257     get_impl_options.merge_operands = merge_operands;
 258     get_impl_options.get_merge_operands_options = get_merge_operands_options;
 259     get_impl_options.number_of_operands = number_of_operands;
 260     get_impl_options.get_value = false;
 261     return GetImpl(options, key, get_impl_options);
 262   }
 263
 264   using DB::MultiGet;
 265   virtual std::vector<Status> MultiGet(
 266       const ReadOptions& options,
 267       const std::vector<ColumnFamilyHandle*>& column_family,
 268       const std::vector<Slice>& keys,
 269       std::vector<std::string>* values) override;
 270   virtual std::vector<Status> MultiGet(
 271       const ReadOptions& options,
 272       const std::vector<ColumnFamilyHandle*>& column_family,
 273       const std::vector<Slice>& keys, std::vector<std::string>* values,
 274       std::vector<std::string>* timestamps) override;
 275
 276   // This MultiGet is a batched version, which may be faster than calling Get
 277   // multiple times, especially if the keys have some spatial locality that
 278   // enables them to be queried in the same SST files/set of files. The larger
 279   // the batch size, the more scope for batching and performance improvement.
 280   // The values and statuses parameters are arrays with number of elements
 281   // equal to num_keys. This allows the storage for those to be allocated
 282   // by the caller on the stack for small batches.
 283   virtual void MultiGet(const ReadOptions& options,
 284                         ColumnFamilyHandle* column_family,
 285                         const size_t num_keys, const Slice* keys,
 286                         PinnableSlice* values, Status* statuses,
 287                         const bool sorted_input = false) override;
 288   virtual void MultiGet(const ReadOptions& options,
 289                         ColumnFamilyHandle* column_family,
 290                         const size_t num_keys, const Slice* keys,
 291                         PinnableSlice* values, std::string* timestamps,
 292                         Status* statuses,
 293                         const bool sorted_input = false) override;
 294
 295   virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
 296                         ColumnFamilyHandle** column_families, const Slice* keys,
 297                         PinnableSlice* values, Status* statuses,
 298                         const bool sorted_input = false) override;
 299   virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
 300                         ColumnFamilyHandle** column_families, const Slice* keys,
 301                         PinnableSlice* values, std::string* timestamps,
 302                         Status* statuses,
 303                         const bool sorted_input = false) override;
 304
 305   virtual void MultiGetWithCallback(
 306       const ReadOptions& options, ColumnFamilyHandle* column_family,
 307       ReadCallback* callback,
 308       autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
 309
 310   virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
 311                                     const std::string& column_family,
 312                                     ColumnFamilyHandle** handle) override;
 313   virtual Status CreateColumnFamilies(
 314       const ColumnFamilyOptions& cf_options,
 315       const std::vector<std::string>& column_family_names,
 316       std::vector<ColumnFamilyHandle*>* handles) override;
 317   virtual Status CreateColumnFamilies(
 318       const std::vector<ColumnFamilyDescriptor>& column_families,
 319       std::vector<ColumnFamilyHandle*>* handles) override;
 320   virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
 321   virtual Status DropColumnFamilies(
 322       const std::vector<ColumnFamilyHandle*>& column_families) override;
 323
 324   // Returns false if key doesn't exist in the database and true if it may.
 325   // If value_found is not passed in as null, then return the value if found in
 326   // memory. On return, if value was found, then *value_found will be set to
 327   // true, otherwise false.
 328   using DB::KeyMayExist;
 329   virtual bool KeyMayExist(const ReadOptions& options,
 330                            ColumnFamilyHandle* column_family, const Slice& key,
 331                            std::string* value, std::string* timestamp,
 332                            bool* value_found = nullptr) override;
 333
 334   using DB::NewIterator;
 335   virtual Iterator* NewIterator(const ReadOptions& options,
 336                                 ColumnFamilyHandle* column_family) override;
 337   virtual Status NewIterators(
 338       const ReadOptions& options,
 339       const std::vector<ColumnFamilyHandle*>& column_families,
 340       std::vector<Iterator*>* iterators) override;
 341
 342   virtual const Snapshot* GetSnapshot() override;
 343   virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
 344   // Create a timestamped snapshot. This snapshot can be shared by multiple
 345   // readers. If any of them uses it for write conflict checking, then
 346   // is_write_conflict_boundary is true. For simplicity, set it to true by
 347   // default.
 348   std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
 349       SequenceNumber snapshot_seq, uint64_t ts);
 350   std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
 351   void ReleaseTimestampedSnapshotsOlderThan(
 352       uint64_t ts, size_t* remaining_total_ss = nullptr);
 353   Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
 354                                  std::vector<std::shared_ptr<const Snapshot>>&
 355                                      timestamped_snapshots) const;
 356
 357   using DB::GetProperty;
 358   virtual bool GetProperty(ColumnFamilyHandle* column_family,
 359                            const Slice& property, std::string* value) override;
 360   using DB::GetMapProperty;
 361   virtual bool GetMapProperty(
 362       ColumnFamilyHandle* column_family, const Slice& property,
 363       std::map<std::string, std::string>* value) override;
 364   using DB::GetIntProperty;
 365   virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
 366                               const Slice& property, uint64_t* value) override;
 367   using DB::GetAggregatedIntProperty;
 368   virtual bool GetAggregatedIntProperty(const Slice& property,
 369                                         uint64_t* aggregated_value) override;
 370   using DB::GetApproximateSizes;
 371   virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
 372                                      ColumnFamilyHandle* column_family,
 373                                      const Range* range, int n,
 374                                      uint64_t* sizes) override;
 375   using DB::GetApproximateMemTableStats;
 376   virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
 377                                            const Range& range,
 378                                            uint64_t* const count,
 379                                            uint64_t* const size) override;
 380   using DB::CompactRange;
 381   virtual Status CompactRange(const CompactRangeOptions& options,
 382                               ColumnFamilyHandle* column_family,
 383                               const Slice* begin, const Slice* end) override;
 384
 385   using DB::CompactFiles;
 386   virtual Status CompactFiles(
 387       const CompactionOptions& compact_options,
 388       ColumnFamilyHandle* column_family,
 389       const std::vector<std::string>& input_file_names, const int output_level,
 390       const int output_path_id = -1,
 391       std::vector<std::string>* const output_file_names = nullptr,
 392       CompactionJobInfo* compaction_job_info = nullptr) override;
 393
 394   virtual Status PauseBackgroundWork() override;
 395   virtual Status ContinueBackgroundWork() override;
 396
 397   virtual Status EnableAutoCompaction(
 398       const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
 399
 400   virtual void EnableManualCompaction() override;
 401   virtual void DisableManualCompaction() override;
 402
 403   using DB::SetOptions;
 404   Status SetOptions(
 405       ColumnFamilyHandle* column_family,
 406       const std::unordered_map<std::string, std::string>& options_map) override;
 407
 408   virtual Status SetDBOptions(
 409       const std::unordered_map<std::string, std::string>& options_map) override;
 410
 411   using DB::NumberLevels;
 412   virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
 413   using DB::MaxMemCompactionLevel;
 414   virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
 415   using DB::Level0StopWriteTrigger;
 416   virtual int Level0StopWriteTrigger(
 417       ColumnFamilyHandle* column_family) override;
 418   virtual const std::string& GetName() const override;
 419   virtual Env* GetEnv() const override;
 420   virtual FileSystem* GetFileSystem() const override;
 421   using DB::GetOptions;
 422   virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
 423   using DB::GetDBOptions;
 424   virtual DBOptions GetDBOptions() const override;
 425   using DB::Flush;
 426   virtual Status Flush(const FlushOptions& options,
 427                        ColumnFamilyHandle* column_family) override;
 428   virtual Status Flush(
 429       const FlushOptions& options,
 430       const std::vector<ColumnFamilyHandle*>& column_families) override;
 431   virtual Status FlushWAL(bool sync) override;
 432   bool WALBufferIsEmpty(bool lock = true);
 433   virtual Status SyncWAL() override;
 434   virtual Status LockWAL() override;
 435   virtual Status UnlockWAL() override;
 436
 437   virtual SequenceNumber GetLatestSequenceNumber() const override;
 438
 439   // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
 440   // and release db_mutex
 441   Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
 442                                   std::string ts_low) override;
 443
 444   // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
 445   // release db_mutex
 446   Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
 447                              std::string* ts_low) override;
 448
 449   virtual Status GetDbIdentity(std::string& identity) const override;
 450
 451   virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
 452
 453   virtual Status GetDbSessionId(std::string& session_id) const override;
 454
 455   ColumnFamilyHandle* DefaultColumnFamily() const override;
 456
 457   ColumnFamilyHandle* PersistentStatsColumnFamily() const;
 458
 459   virtual Status Close() override;
 460
 461   virtual Status DisableFileDeletions() override;
 462
 463   virtual Status EnableFileDeletions(bool force) override;
 464
 465   virtual bool IsFileDeletionsEnabled() const;
 466
 467   Status GetStatsHistory(
 468       uint64_t start_time, uint64_t end_time,
 469       std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
 470
 471 #ifndef ROCKSDB_LITE
 472   using DB::ResetStats;
 473   virtual Status ResetStats() override;
 474   // All the returned filenames start with "/"
 475   virtual Status GetLiveFiles(std::vector<std::string>&,
 476                               uint64_t* manifest_file_size,
 477                               bool flush_memtable = true) override;
 478   virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
 479   virtual Status GetCurrentWalFile(
 480       std::unique_ptr<LogFile>* current_log_file) override;
 481   virtual Status GetCreationTimeOfOldestFile(
 482       uint64_t* creation_time) override;
 483
 484   virtual Status GetUpdatesSince(
 485       SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
 486       const TransactionLogIterator::ReadOptions& read_options =
 487           TransactionLogIterator::ReadOptions()) override;
 488   virtual Status DeleteFile(std::string name) override;
 489   Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
 490                              const RangePtr* ranges, size_t n,
 491                              bool include_end = true);
 492
 493   virtual void GetLiveFilesMetaData(
 494       std::vector<LiveFileMetaData>* metadata) override;
 495
 496   virtual Status GetLiveFilesChecksumInfo(
 497       FileChecksumList* checksum_list) override;
 498
 499   virtual Status GetLiveFilesStorageInfo(
 500       const LiveFilesStorageInfoOptions& opts,
 501       std::vector<LiveFileStorageInfo>* files) override;
 502
 503   // Obtains the meta data of the specified column family of the DB.
 504   // TODO(yhchiang): output parameter is placed in the end in this codebase.
 505   virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
 506                                        ColumnFamilyMetaData* metadata) override;
 507
 508   void GetAllColumnFamilyMetaData(
 509       std::vector<ColumnFamilyMetaData>* metadata) override;
 510
 511   Status SuggestCompactRange(ColumnFamilyHandle* column_family,
 512                              const Slice* begin, const Slice* end) override;
 513
 514   Status PromoteL0(ColumnFamilyHandle* column_family,
 515                    int target_level) override;
 516
 517   using DB::IngestExternalFile;
 518   virtual Status IngestExternalFile(
 519       ColumnFamilyHandle* column_family,
 520       const std::vector<std::string>& external_files,
 521       const IngestExternalFileOptions& ingestion_options) override;
 522
 523   using DB::IngestExternalFiles;
 524   virtual Status IngestExternalFiles(
 525       const std::vector<IngestExternalFileArg>& args) override;
 526
 527   using DB::CreateColumnFamilyWithImport;
 528   virtual Status CreateColumnFamilyWithImport(
 529       const ColumnFamilyOptions& options, const std::string& column_family_name,
 530       const ImportColumnFamilyOptions& import_options,
 531       const ExportImportFilesMetaData& metadata,
 532       ColumnFamilyHandle** handle) override;
 533
 534   using DB::VerifyFileChecksums;
 535   Status VerifyFileChecksums(const ReadOptions& read_options) override;
 536
 537   using DB::VerifyChecksum;
 538   virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
 539   // Verify the checksums of files in db. Currently only tables are checked.
 540   //
 541   // read_options: controls file I/O behavior, e.g. read ahead size while
 542   //               reading all the live table files.
 543   //
 544   // use_file_checksum: if false, verify the block checksums of all live table
 545   //                    in db. Otherwise, obtain the file checksums and compare
 546   //                    with the MANIFEST. Currently, file checksums are
 547   //                    recomputed by reading all table files.
 548   //
 549   // Returns: OK if there is no file whose file or block checksum mismatches.
 550   Status VerifyChecksumInternal(const ReadOptions& read_options,
 551                                 bool use_file_checksum);
 552
 553   Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
 554                                 const std::string& func_name_expected,
 555                                 const std::string& fpath,
 556                                 const ReadOptions& read_options);
 557
 558   using DB::StartTrace;
 559   virtual Status StartTrace(
 560       const TraceOptions& options,
 561       std::unique_ptr<TraceWriter>&& trace_writer) override;
 562
 563   using DB::EndTrace;
 564   virtual Status EndTrace() override;
 565
 566   using DB::NewDefaultReplayer;
 567   virtual Status NewDefaultReplayer(
 568       const std::vector<ColumnFamilyHandle*>& handles,
 569       std::unique_ptr<TraceReader>&& reader,
 570       std::unique_ptr<Replayer>* replayer) override;
 571
 572   using DB::StartBlockCacheTrace;
 573   Status StartBlockCacheTrace(
 574       const TraceOptions& trace_options,
 575       std::unique_ptr<TraceWriter>&& trace_writer) override;
 576
 577   Status StartBlockCacheTrace(
 578       const BlockCacheTraceOptions& options,
 579       std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
 580
 581   using DB::EndBlockCacheTrace;
 582   Status EndBlockCacheTrace() override;
 583
 584   using DB::StartIOTrace;
 585   Status StartIOTrace(const TraceOptions& options,
 586                       std::unique_ptr<TraceWriter>&& trace_writer) override;
 587
 588   using DB::EndIOTrace;
 589   Status EndIOTrace() override;
 590
 591   using DB::GetPropertiesOfAllTables;
 592   virtual Status GetPropertiesOfAllTables(
 593       ColumnFamilyHandle* column_family,
 594       TablePropertiesCollection* props) override;
 595   virtual Status GetPropertiesOfTablesInRange(
 596       ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
 597       TablePropertiesCollection* props) override;
 598
 599 #endif  // ROCKSDB_LITE
 600
 601   // ---- End of implementations of the DB interface ----
 602   SystemClock* GetSystemClock() const;
 603
 604   struct GetImplOptions {
 605     ColumnFamilyHandle* column_family = nullptr;
 606     PinnableSlice* value = nullptr;
 607     PinnableWideColumns* columns = nullptr;
 608     std::string* timestamp = nullptr;
 609     bool* value_found = nullptr;
 610     ReadCallback* callback = nullptr;
 611     bool* is_blob_index = nullptr;
 612     // If true return value associated with key via value pointer else return
 613     // all merge operands for key via merge_operands pointer
 614     bool get_value = true;
 615     // Pointer to an array of size
 616     // get_merge_operands_options.expected_max_number_of_operands allocated by
 617     // user
 618     PinnableSlice* merge_operands = nullptr;
 619     GetMergeOperandsOptions* get_merge_operands_options = nullptr;
 620     int* number_of_operands = nullptr;
 621   };
 622
 623   // Function that Get and KeyMayExist call with no_io true or false
 624   // Note: 'value_found' from KeyMayExist propagates here
 625   // This function is also called by GetMergeOperands
 626   // If get_impl_options.get_value = true get value associated with
 627   // get_impl_options.key via get_impl_options.value
 628   // If get_impl_options.get_value = false get merge operands associated with
 629   // get_impl_options.key via get_impl_options.merge_operands
 630   Status GetImpl(const ReadOptions& options, const Slice& key,
 631                  GetImplOptions& get_impl_options);
 632
 633   // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
 634   ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
 635                                       ColumnFamilyData* cfd,
 636                                       SequenceNumber snapshot,
 637                                       ReadCallback* read_callback,
 638                                       bool expose_blob_index = false,
 639                                       bool allow_refresh = true);
 640
 641   virtual SequenceNumber GetLastPublishedSequence() const {
 642     if (last_seq_same_as_publish_seq_) {
 643       return versions_->LastSequence();
 644     } else {
 645       return versions_->LastPublishedSequence();
 646     }
 647   }
 648
 649   // REQUIRES: joined the main write queue if two_write_queues is disabled, and
 650   // the second write queue otherwise.
 651   virtual void SetLastPublishedSequence(SequenceNumber seq);
 652   // Returns LastSequence in last_seq_same_as_publish_seq_
 653   // mode and LastAllocatedSequence otherwise. This is useful when visibility
 654   // depends also on data written to the WAL but not to the memtable.
 655   SequenceNumber TEST_GetLastVisibleSequence() const;
 656
 657 #ifndef ROCKSDB_LITE
 658   // Similar to Write() but will call the callback once on the single write
 659   // thread to determine whether it is safe to perform the write.
 660   virtual Status WriteWithCallback(const WriteOptions& write_options,
 661                                    WriteBatch* my_batch,
 662                                    WriteCallback* callback);
 663
 664   // Returns the sequence number that is guaranteed to be smaller than or equal
 665   // to the sequence number of any key that could be inserted into the current
 666   // memtables. It can then be assumed that any write with a larger(or equal)
 667   // sequence number will be present in this memtable or a later memtable.
 668   //
 669   // If the earliest sequence number could not be determined,
 670   // kMaxSequenceNumber will be returned.
 671   //
 672   // If include_history=true, will also search Memtables in MemTableList
 673   // History.
 674   SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
 675                                                    bool include_history);
 676
 677   // For a given key, check to see if there are any records for this key
 678   // in the memtables, including memtable history.  If cache_only is false,
 679   // SST files will also be checked.
 680   //
 681   // `key` should NOT have user-defined timestamp appended to user key even if
 682   // timestamp is enabled.
 683   //
 684   // If a key is found, *found_record_for_key will be set to true and
 685   // *seq will be set to the stored sequence number for the latest
 686   // operation on this key or kMaxSequenceNumber if unknown. If user-defined
 687   // timestamp is enabled for this column family and timestamp is not nullptr,
 688   // then *timestamp will be set to the stored timestamp for the latest
 689   // operation on this key.
 690   // If no key is found, *found_record_for_key will be set to false.
 691   //
 692   // Note: If cache_only=false, it is possible for *seq to be set to 0 if
 693   // the sequence number has been cleared from the record.  If the caller is
 694   // holding an active db snapshot, we know the missing sequence must be less
 695   // than the snapshot's sequence number (sequence numbers are only cleared
 696   // when there are no earlier active snapshots).
 697   //
 698   // If NotFound is returned and found_record_for_key is set to false, then no
 699   // record for this key was found.  If the caller is holding an active db
 700   // snapshot, we know that no key could have existed after this snapshot
 701   // (since we do not compact keys that have an earlier snapshot).
 702   //
 703   // Only records newer than or at `lower_bound_seq` are guaranteed to be
 704   // returned. Memtables and files may not be checked if it only contains data
 705   // older than `lower_bound_seq`.
 706   //
       // NOTE(review): `is_blob_index` is not described above; presumably it
       // reports whether the latest record is a blob index -- confirm in the
       // definition.
       //
 707   // Returns OK or NotFound on success,
 708   // other status on unexpected error.
 709   // TODO(andrewkr): this API needs to be aware of range deletion operations
 710   Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
 711                                  bool cache_only,
 712                                  SequenceNumber lower_bound_seq,
 713                                  SequenceNumber* seq, std::string* timestamp,
 714                                  bool* found_record_for_key,
 715                                  bool* is_blob_index);
 716
       // Tracing hooks for iterator Seek / SeekForPrev, taking the column family
       // id, the target key and the iterate bounds. Note that `upper_bound` is
       // passed by value while `lower_bound` is passed by reference.
       // NOTE(review): presumably these append a record to the active trace and
       // do nothing when tracing is off -- confirm in the definition.
 717   Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
 718                            const Slice& lower_bound, const Slice upper_bound);
 719   Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
 720                                   const Slice& lower_bound,
 721                                   const Slice upper_bound);
 722 #endif  // ROCKSDB_LITE
 723
 724   // Similar to GetSnapshot(), but also lets the db know that this snapshot
 725   // will be used for transaction write-conflict checking.  The DB can then
 726   // make sure not to compact any keys that would prevent a write-conflict from
 727   // being detected.
       // NOTE(review): presumably the result is released the same way as a
       // snapshot obtained from GetSnapshot() -- confirm.
 728   const Snapshot* GetSnapshotForWriteConflictBoundary();
 729
 730   // Checks that all live files exist on the file system and that their file
 731   // sizes match our in-memory records. Virtual, so subclasses may override it.
 732   virtual Status CheckConsistency();
 733
 734   // max_file_num_to_ignore allows bottom level compaction to filter out newly
 735   // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
 736   // disable the filtering
       //
       // NOTE(review): the remaining parameters are not described here;
       // presumably [begin, end] bounds the key range (nullptr meaning
       // unbounded) and `trim_ts` is a user-defined-timestamp cutoff -- confirm
       // in the definition.
 737   Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
 738                              int output_level,
 739                              const CompactRangeOptions& compact_range_options,
 740                              const Slice* begin, const Slice* end,
 741                              bool exclusive, bool disallow_trivial_move,
 742                              uint64_t max_file_num_to_ignore,
 743                              const std::string& trim_ts);
 744
 745   // Return an internal iterator over the current state of the database.
 746   // The keys of this iterator are internal keys (see format.h).
 747   // The returned iterator should be deleted when no longer needed.
 748   // If allow_unprepared_value is true, the returned iterator may defer reading
 749   // the value and so will require PrepareValue() to be called before value();
 750   // allow_unprepared_value = false is convenient when this optimization is not
 751   // useful, e.g. when reading the whole column family.
 752   //
 753   // read_options.ignore_range_deletions determines whether range tombstones are
 754   // processed in the returned iterator internally, i.e., whether range
 755   // tombstone covered keys are in this iterator's output.
 756   // @param read_options Must outlive the returned iterator.
 757   InternalIterator* NewInternalIterator(
 758       const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
 759       ColumnFamilyHandle* column_family = nullptr,
 760       bool allow_unprepared_value = false);
 761
       // Overload that takes the column family data and super version directly.
 762   // Note: to support DB iterator refresh, memtable range tombstones in the
 763   // underlying merging iterator need to be refreshed. If db_iter is not
 764   // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
 765   // memtable range tombstone iterator used by the underlying merging iterator.
 766   // This range tombstone iterator can be refreshed later by db_iter.
 767   // @param read_options Must outlive the returned iterator.
 768   InternalIterator* NewInternalIterator(const ReadOptions& read_options,
 769                                         ColumnFamilyData* cfd,
 770                                         SuperVersion* super_version,
 771                                         Arena* arena, SequenceNumber sequence,
 772                                         bool allow_unprepared_value,
 773                                         ArenaWrappedDBIter* db_iter = nullptr);
 774
       // Returns a pointer to the DB's logs_with_prep_tracker_ member. The
       // pointer is non-owning.
 775   LogsWithPrepTracker* logs_with_prep_tracker() {
 776     return &logs_with_prep_tracker_;
 777   }
 778
       // Pair of limits returned by GetBGJobLimits(): the maximum number of
       // background flushes and of background compactions.
 779   struct BGJobLimits {
 780     int max_flushes;
 781     int max_compactions;
 782   };
 783   // Returns maximum background flushes and compactions allowed to be scheduled
 784   BGJobLimits GetBGJobLimits() const;
 785   // Need a static version that can be called during SanitizeOptions().
       // It computes the limits purely from the option values passed in.
 786   static BGJobLimits GetBGJobLimits(int max_background_flushes,
 787                                     int max_background_compactions,
 788                                     int max_background_jobs,
 789                                     bool parallelize_compactions);
 790
 791   // Move logs pending closing from job_context to the DB queue and
 792   // schedule a purge.
 793   void ScheduleBgLogWriterClose(JobContext* job_context);
 794
       // NOTE(review): presumably returns the smallest WAL file number that must
       // still be retained -- confirm in the definition.
 795   uint64_t MinLogNumberToKeep();
 796
 797   // Returns the lower bound file number for SSTs that won't be deleted, even if
 798   // they're obsolete. This lower bound is used internally to prevent newly
 799   // created flush/compaction output files from being deleted before they're
 800   // installed. This technique avoids the need for tracking the exact numbers of
 801   // files pending creation, although it prevents more files than necessary from
 802   // being deleted (a deliberately conservative trade-off).
 803   uint64_t MinObsoleteSstNumberToKeep();
 804
 805   // Returns the list of live files in 'live' and the list
 806   // of all files in the filesystem in 'candidate_files'.
       // NOTE(review): 'live' and 'candidate_files' are presumably fields of
       // *job_context -- confirm against JobContext.
 807   // If force == false and the last call was less than
 808   // db_options_.delete_obsolete_files_period_micros microseconds ago,
 809   // it will not fill up the job_context
 810   void FindObsoleteFiles(JobContext* job_context, bool force,
 811                          bool no_full_scan = false);
 812
 813   // Diffs the files listed in filenames and those that do not
 814   // belong to live files are possibly removed. Also, removes all the
 815   // files in sst_delete_files and log_delete_files.
 816   // It is not necessary to hold the mutex when invoking this method.
 817   // If FindObsoleteFiles() was run, we need to also run
 818   // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
       // NOTE(review): presumably schedule_only = true queues the deletions for
       // a background purge instead of deleting inline -- confirm.
 819   void PurgeObsoleteFiles(JobContext& background_contet,
 820                           bool schedule_only = false);
 821
 822   // Schedule a background job that performs the actual deletion of obsolete
       // files.
 823   void SchedulePurge();
 824
       // Read-only access to the snapshot list. This accessor does no locking
       // itself; LoadSnapshots() takes the DB mutex before reading the list.
 825   const SnapshotList& snapshots() const { return snapshots_; }
 826
 827   // Fills `snap_vector`, in ascending order, with the snapshots that are no
 828   // newer than `max_seq`.
 829   // `oldest_write_conflict_snapshot` receives the oldest snapshot for which
 830   // SnapshotImpl.is_write_conflict_boundary_ = true.
       // The DB mutex is held for the duration of the call.
 831   void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
 832                      SequenceNumber* oldest_write_conflict_snapshot,
 833                      const SequenceNumber& max_seq) const {
 834     InstrumentedMutexLock db_mutex_guard(mutex());
 835     snapshots_.GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
 836   }
 837
       // Read-only access to the DB options that are fixed for this DB instance.
 838   const ImmutableDBOptions& immutable_db_options() const {
 839     return immutable_db_options_;
 840   }
 841
 842   // Cancel all background jobs, including flush, compaction, background
 843   // purging, stats dumping threads, etc. If `wait` = true, wait for the
 844   // running jobs to abort or finish before returning. Otherwise, only
 845   // send the signals and return without waiting.
 846   void CancelAllBackgroundWork(bool wait);
 847
 848   // Find the SuperVersion for `cfd` and reference it. Based on options, it
 849   // might return the thread local cached one.
 850   // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
 851   SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
 852
 853   // Similar to the previous function but looks up based on a column family id.
 854   // nullptr will be returned if this column family no longer exists.
 855   // REQUIRED: this function should only be called on the write thread or if the
 856   // mutex is held.
       // Release the result with ReturnAndCleanupSuperVersion().
 857   SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
 858
 859   // Un-reference the super version and clean it up if it is the last reference.
       // Unlike ReturnAndCleanupSuperVersion(), no column family is supplied, so
       // NOTE(review): presumably the thread local cache is not involved.
 860   void CleanupSuperVersion(SuperVersion* sv);
 861
 862   // Un-reference the super version and return it to thread local cache if
 863   // needed. If it is the last reference of the super version, clean it up
 864   // after un-referencing it.
 865   void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
 866
 867   // Similar to the previous function but looks up based on a column family id.
 868   // This overload returns void, so nothing is reported to the caller.
       // NOTE(review): confirm in the definition what happens to `sv` when the
       // column family no longer exists.
 869   // REQUIRED: this function should only be called on the write thread.
 870   void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);
 871
       // Looks up the handle for the column family with the given id.
 872   // REQUIRED: this function should only be called on the write thread or if the
 873   // mutex is held.  Return value only valid until next call to this function or
 874   // mutex is released.
 875   ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
 876
 877   // Same as above, but should be called without the mutex held and not on the
       // write thread. The handle is returned in a std::unique_ptr, so the caller
       // owns it.
 878   std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
 879       uint32_t column_family_id);
 880
 881   // Returns the number of currently running flushes.
 882   // REQUIREMENT: mutex_ must be held when calling this function.
 883   int num_running_flushes() {
 884     mutex_.AssertHeld();  // enforce the locking requirement stated above
 885     return num_running_flushes_;
 886   }
 887
 888   // Returns the number of currently running compactions.
 889   // REQUIREMENT: mutex_ must be held when calling this function.
 890   int num_running_compactions() {
 891     mutex_.AssertHeld();  // enforce the locking requirement stated above
 892     return num_running_compactions_;
 893   }
 894
       // Read-only access to the DB's write controller.
 895   const WriteController& write_controller() { return write_controller_; }
 896
 897   // hollow transactions shell used for recovery.
 898   // these will then be passed to TransactionDB so that
 899   // locks can be reacquired before writing can resume.
       //
       // Ownership: every WriteBatch stored in batches_ is deleted by the
       // destructor. Copying is therefore disabled; an implicit member-wise copy
       // would leave two objects deleting the same batches.
 900   struct RecoveredTransaction {
 901     std::string name_;
 902     bool unprepared_;
 903
 904     struct BatchInfo {
 905       uint64_t log_number_;
 906       // TODO(lth): For unprepared, the memory usage here can be big for
 907       // unprepared transactions. This is only useful for rollbacks, and we
 908       // can in theory just keep keyset for that.
 909       WriteBatch* batch_;
 910       // Number of sub-batches. A new sub-batch is created if txn attempts to
 911       // insert a duplicate key,seq to memtable. This is currently used in
 912       // WritePreparedTxn/WriteUnpreparedTxn.
 913       size_t batch_cnt_;
 914     };
 915
 916     // This maps the seq of the first key in the batch to BatchInfo, which
 917     // contains WriteBatch and other information relevant to the batch.
 918     //
 919     // For WriteUnprepared, batches_ can have size greater than 1, but for
 920     // other write policies, it must be of size 1.
 921     std::map<SequenceNumber, BatchInfo> batches_;
 922
         // Creates the transaction with its first batch, keyed by `seq`. Takes
         // ownership of `batch`.
 923     explicit RecoveredTransaction(const uint64_t log, const std::string& name,
 924                                   WriteBatch* batch, SequenceNumber seq,
 925                                   size_t batch_cnt, bool unprepared)
 926         : name_(name), unprepared_(unprepared) {
 927       batches_[seq] = {log, batch, batch_cnt};
 928     }
 929
         // Non-copyable: the destructor deletes the batches held in batches_.
         RecoveredTransaction(const RecoveredTransaction&) = delete;
         RecoveredTransaction& operator=(const RecoveredTransaction&) = delete;

         // Frees every owned WriteBatch.
 930     ~RecoveredTransaction() {
 931       for (auto& it : batches_) {
 932         delete it.second.batch_;
 933       }
 934     }
 935
         // Appends another batch, keyed by `seq`, and takes ownership of `batch`.
         // `seq` must not already be present (checked by assert only).
 936     void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
 937                   size_t batch_cnt, bool unprepared) {
 938       assert(batches_.count(seq) == 0);
 939       batches_[seq] = {log_number, batch, batch_cnt};
 940       // Prior state must be unprepared, since the prepare batch must be the
 941       // last batch.
 942       assert(unprepared_);
 943       unprepared_ = unprepared;
 944     }
 945   };
 946
       // Returns the allow_2pc setting from the immutable DB options.
 947   bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
 948
       // Returns the name -> RecoveredTransaction* map BY VALUE, i.e. a copy of
       // the map is made on every call. The pointers in the copy still refer to
       // the transactions owned by this DB.
 949   std::unordered_map<std::string, RecoveredTransaction*>
 950   recovered_transactions() {
 951     return recovered_transactions_;
 952   }
 953
       // Looks up `name` in recovered_transactions_ and returns the stored
       // transaction pointer, or nullptr when there is no entry with that name.
 954   RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
 955     const auto found = recovered_transactions_.find(name);
 956     return found != recovered_transactions_.end() ? found->second : nullptr;
 957   }
 962
       // Records a batch recovered for transaction `name`.
       //
       // With WriteUnpreparedTxn this is called once per unprepared batch seen
       // during recovery; if the transaction is prepared, the last of those
       // calls has unprepared_batch = false.
       //
       // The first call for a name creates the RecoveredTransaction; later calls
       // append to it. Either way `log` is marked as containing a prepared
       // section.
 963   void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
 964                                   WriteBatch* batch, SequenceNumber seq,
 965                                   size_t batch_cnt, bool unprepared_batch) {
 966     const auto existing = recovered_transactions_.find(name);
 967     if (existing != recovered_transactions_.end()) {
 968       existing->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
 969     } else {
 970       recovered_transactions_[name] = new RecoveredTransaction(
 971           log, name, batch, seq, batch_cnt, unprepared_batch);
 972     }
 973     logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
 974   }
 980
       // Removes the recovered transaction called `name`, marks the log of each
       // of its batches as having its prepared section flushed, and frees it.
       // `name` must be present (checked by assert only).
 981   void DeleteRecoveredTransaction(const std::string& name) {
 982     const auto pos = recovered_transactions_.find(name);
 983     assert(pos != recovered_transactions_.end());
 984     RecoveredTransaction* const doomed = pos->second;
 985     recovered_transactions_.erase(pos);
 986     for (const auto& seq_and_batch : doomed->batches_) {
 987       const uint64_t log = seq_and_batch.second.log_number_;
 988       logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(log);
 989     }
 990     delete doomed;
 991   }
 992
       // Frees every recovered transaction and empties the map. Unlike
       // DeleteRecoveredTransaction(), this does not update
       // logs_with_prep_tracker_.
 993   void DeleteAllRecoveredTransactions() {
 994     for (auto& name_and_txn : recovered_transactions_) {
 995       delete name_and_txn.second;
 996     }
 997     recovered_transactions_.clear();
 998   }
1000
       // Appends `log_writer` to logs_to_free_queue_.
       // REQUIRES: mutex_ held (asserted).
1001   void AddToLogsToFreeQueue(log::Writer* log_writer) {
1002     mutex_.AssertHeld();
1003     logs_to_free_queue_.push_back(log_writer);
1004   }
1005
       // Appends `sv` to superversions_to_free_queue_.
       // NOTE(review): unlike AddToLogsToFreeQueue() there is no
       // mutex_.AssertHeld() here -- confirm what synchronization callers
       // provide.
1006   void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
1007     superversions_to_free_queue_.push_back(sv);
1008   }
1009
       // Installs the snapshot checker used by this DB.
       // NOTE(review): ownership of `snapshot_checker` and thread-safety are not
       // visible from this declaration -- confirm in the definition.
1010   void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
1011
1012   // Fill JobContext with snapshot information needed by flush and compaction.
       // The snapshot sequence numbers, the earliest write-conflict snapshot and
       // the snapshot checker are handed back through the three out-parameters.
1013   void GetSnapshotContext(JobContext* job_context,
1014                           std::vector<SequenceNumber>* snapshot_seqs,
1015                           SequenceNumber* earliest_write_conflict_snapshot,
1016                           SnapshotChecker** snapshot_checker);
1017
1018   // Not thread-safe.
       // NOTE(review): presumably installs the callback invoked before the
       // recoverable state is released -- confirm in the definition.
1019   void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
1020
       // Returns the DB mutex. Callable on a const DBImpl because mutex_ is
       // declared mutable.
1021   InstrumentedMutex* mutex() const { return &mutex_; }
1022
1023   // Initialize a brand new DB. The DB directory is expected to be empty before
1024   // calling it. The new manifest file name is pushed into `new_filenames`.
1025   Status NewDB(std::vector<std::string>* new_filenames);
1026
1027   // This is to be used only by internal rocksdb classes.
       // `seq_per_batch` and `batch_per_txn` correspond to the seq_per_batch_ and
       // batch_per_txn_ members documented further down in this class.
1028   static Status Open(const DBOptions& db_options, const std::string& name,
1029                      const std::vector<ColumnFamilyDescriptor>& column_families,
1030                      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
1031                      const bool seq_per_batch, const bool batch_per_txn);
1032
       // NOTE(review): presumably creates `dirname` if it is missing and then
       // opens it, returning the directory object through `directory` --
       // confirm in the definition.
1033   static IOStatus CreateAndNewDirectory(
1034       FileSystem* fs, const std::string& dirname,
1035       std::unique_ptr<FSDirectory>* directory);
1036
1037   // Find the stats map from stats_history_ with the smallest timestamp in
1038   // the range of [start_time, end_time).
       // NOTE(review): presumably the return value tells whether an entry was
       // found, with its timestamp in *new_time and its contents in *stats_map
       // -- confirm in the definition.
1039   bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
1040                        uint64_t* new_time,
1041                        std::map<std::string, uint64_t>* stats_map);
1042
1043   // Print information of all tombstones of all iterators to the std::string
1044   // `out_str`. This is only used by ldb. The output might be capped. Tombstones
1045   // printed out are not guaranteed to be in any order.
1046   Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
1047                                      int max_entries_to_print,
1048                                      std::string* out_str);
1049
       // Returns the raw pointer held by versions_; ownership stays with the
       // unique_ptr member.
1050   VersionSet* GetVersionSet() const { return versions_.get(); }
1051
1052   // Wait for any compaction
1053   // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
1054   // is only for the special test of CancelledCompactions
       // (TEST_WaitForCompact() below takes the same parameter.)
1055   Status WaitForCompact(bool waitUnscheduled = false);
1056
1057 #ifndef NDEBUG
1058   // Compact any files in the named level that overlap [*begin, *end]
       // Test-only: declared only when NDEBUG is not defined.
1059   Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
1060                            ColumnFamilyHandle* column_family = nullptr,
1061                            bool disallow_trivial_move = false);
1062
       // Test-only hooks (declared only when NDEBUG is not defined).
       // NOTE(review): presumably forces a WAL switch -- confirm.
1063   Status TEST_SwitchWAL();
1064
       // Returns the unable_to_release_oldest_log_ flag.
1065   bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
1066
       // Returns getting_flushed of the first entry in alive_log_files_.
       // Dereferences begin() without an emptiness check, so the container must
       // not be empty.
1067   bool TEST_IsLogGettingFlushed() {
1068     return alive_log_files_.begin()->getting_flushed;
1069   }
1070
       // NOTE(review): presumably switches the memtable of `cfd`, with nullptr
       // selecting a default -- confirm in the definition.
1071   Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
1072
1073   // Force current memtable contents to be flushed.
1074   Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
1075                             ColumnFamilyHandle* cfh = nullptr);
1076
       // Overload that takes the column family data and explicit FlushOptions.
1077   Status TEST_FlushMemTable(ColumnFamilyData* cfd,
1078                             const FlushOptions& flush_opts);
1079
1080   // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
1081   // is because in certain cases, we can flush column families, wait for the
1082   // flush to complete, but delete the column family handle before the wait
1083   // finishes, for example in CompactRange.
1084   Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
1085                                    const FlushOptions& flush_opts);
1086
1087   // Wait for background threads to complete scheduled work.
1088   Status TEST_WaitForBackgroundWork();
1089
1090   // Wait for memtable compaction (i.e. the memtable flush).
1091   Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
1092
1093   // Wait for any compaction
1094   // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
1095   // is only for the special test of CancelledCompactions
1096   Status TEST_WaitForCompact(bool waitUnscheduled = false);
1097
1098   // Wait for any background purge
1099   Status TEST_WaitForPurge();
1100
1101   // Get the background error status
1102   Status TEST_GetBGError();
1103
1104   // Return the maximum overlapping data (in bytes) at next level for any
1105   // file at a level >= 1.
1106   uint64_t TEST_MaxNextLevelOverlappingBytes(
1107       ColumnFamilyHandle* column_family = nullptr);
1108
1109   // Return the current manifest file number.
1110   uint64_t TEST_Current_Manifest_FileNo();
1111
1112   // Returns the number that'll be assigned to the next file that's created.
1113   uint64_t TEST_Current_Next_FileNo();
1114
1115   // Get the total level0 file size. Only for testing.
1116   uint64_t TEST_GetLevel0TotalSize();
1117
       // NOTE(review): presumably fills `metadata` with the file metadata of
       // `column_family`, one inner vector per level, and `blob_metadata` (when
       // non-null) with blob file metadata -- confirm in the definition.
1118   void TEST_GetFilesMetaData(
1119       ColumnFamilyHandle* column_family,
1120       std::vector<std::vector<FileMetaData>>* metadata,
1121       std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
1122
       // NOTE(review): presumably lock / unlock the DB mutex directly -- confirm.
1123   void TEST_LockMutex();
1124
1125   void TEST_UnlockMutex();
1126
1127   // REQUIRES: mutex locked
       // Returns an opaque pointer that must later be passed to TEST_EndWrite().
1128   void* TEST_BeginWrite();
1129
1130   // REQUIRES: mutex locked
1131   // pass the pointer that you got from TEST_BeginWrite()
1132   void TEST_EndWrite(void* w);
1133
       // Returns the current value of max_total_in_memory_state_ (an atomic).
1134   uint64_t TEST_MaxTotalInMemoryState() const {
1135     return max_total_in_memory_state_;
1136   }
1137
       // NOTE(review): presumably the size of logs_to_free_ -- confirm.
1138   size_t TEST_LogsToFreeSize();
1139
       // NOTE(review): presumably the current logfile_number_ -- confirm.
1140   uint64_t TEST_LogfileNumber();
1141
       // Returns total_log_size_.
1142   uint64_t TEST_total_log_size() const { return total_log_size_; }
1143
1144   // Returns column family name to ImmutableCFOptions map.
1145   Status TEST_GetAllImmutableCFOptions(
1146       std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
1147
1148   // Return the latest MutableCFOptions of a column family
1149   Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
1150                                         MutableCFOptions* mutable_cf_options);
1151
       // Returns the raw pointer held by table_cache_.
1152   Cache* TEST_table_cache() { return table_cache_.get(); }
1153
       // Returns a mutable reference to write_controller_.
1154   WriteController& TEST_write_controler() { return write_controller_; }
1155
       // Test-only accessors for prepared-section / log tracking state.
       // NOTE(review): semantics are defined elsewhere -- confirm there.
1156   uint64_t TEST_FindMinLogContainingOutstandingPrep();
1157   uint64_t TEST_FindMinPrepLogReferencedByMemTable();
1158   size_t TEST_PreparedSectionCompletedSize();
1159   size_t TEST_LogsWithPrepSize();
1160
       // Test-only accessors for background job limits, WAL preallocation,
       // periodic tasks, the seqno-to-time mapping and in-memory stats history.
       // NOTE(review): semantics are defined elsewhere -- confirm there.
1161   int TEST_BGCompactionsAllowed() const;
1162   int TEST_BGFlushesAllowed() const;
1163   size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
1164   void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
1165   SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
1166   size_t TEST_EstimateInMemoryStatsHistorySize() const;
1167
       // Returns the `number` of the last entry in logs_, read while holding the
       // DB mutex. logs_ must not be empty (checked by assert only).
1168   uint64_t TEST_GetCurrentLogNumber() const {
1169     InstrumentedMutexLock l(mutex());
1170     assert(!logs_.empty());
1171     return logs_.back().number;
1172   }
1173
       // Returns a reference to files_grabbed_for_purge_. No lock is taken here.
1174   const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
1175     return files_grabbed_for_purge_;
1176   }
1177
1178 #ifndef ROCKSDB_LITE
       // Test-only access to the DB's periodic task scheduler. Declared only
       // when neither NDEBUG nor ROCKSDB_LITE is defined.
1179   const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
1180 #endif  // !ROCKSDB_LITE
1181
1182 #endif  // NDEBUG
1183
1184   // Persist stats to column family "_persistent_stats"
1185   void PersistStats();
1186
1187   // Dump rocksdb.stats to LOG
1188   void DumpStats();
1189
1190   // Flush LOG out of application buffer
1191   void FlushInfoLog();
1192
1193   // Record current sequence number to time mapping
1194   void RecordSeqnoToTimeMapping();
1195
1196   // Interface to block and signal the DB in case of stalling writes by
1197   // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
1198   // When DB needs to be blocked or signalled by WriteBufferManager,
1199   // state_ is changed accordingly.
1200   class WBMStallInterface : public StallInterface {
1201    public:
1202     enum State {
1203       BLOCKED = 0,
1204       RUNNING,
1205     };
1206
         // Starts in State::RUNNING, so Block() returns immediately until
         // SetState(State::BLOCKED) has been called.
1207     WBMStallInterface() : state_cv_(&state_mutex_) {
1208       MutexLock lock(&state_mutex_);
1209       state_ = State::RUNNING;
1210     }
1211
         // Overwrites state_ under state_mutex_. Does not signal state_cv_.
1212     void SetState(State state) {
1213       MutexLock lock(&state_mutex_);
1214       state_ = state;
1215     }
1216
1217     // Waits on state_cv_ for as long as state_ is State::BLOCKED. This method
1218     // does not set State::BLOCKED itself; that is done through SetState().
1219     // When the stall is cleared, Signal() changes the state and unblocks the DB.
1220     void Block() override {
1221       MutexLock lock(&state_mutex_);
           // Loop so that the state is re-checked after every wakeup.
1222       while (state_ == State::BLOCKED) {
1223         TEST_SYNC_POINT("WBMStallInterface::BlockDB");
1224         state_cv_.Wait();
1225       }
1226     }
1227
1228     // Called from WriteBufferManager. This function changes the state_
1229     // to State::RUNNING indicating the stall is cleared and DB can proceed.
1230     void Signal() override {
1231       {
1232         MutexLock lock(&state_mutex_);
1233         state_ = State::RUNNING;
1234       }
           // The condition variable is signalled after state_mutex_ is released.
1235       state_cv_.Signal();
1236     }
1237
1238    private:
1239     // Condition variable and mutex to block and
1240     // signal the DB during stalling process.
1241     port::Mutex state_mutex_;
1242     port::CondVar state_cv_;
1243     // State representing whether DB is running or blocked because of stall by
1244     // WriteBufferManager.
1245     State state_;
1246   };
1247
       // NOTE(review): presumably GenerateDbSessionId() produces the value stored
       // in db_session_id_ and TEST_ResetDbSessionIdGen() resets the generator's
       // state for tests -- confirm in the definitions.
1248   static void TEST_ResetDbSessionIdGen();
1249   static std::string GenerateDbSessionId(Env* env);
1250
       // Returns seq_per_batch_ (see the member's comment for its meaning).
1251   bool seq_per_batch() const { return seq_per_batch_; }
1252
1253  protected:
1254   const std::string dbname_;
1255   // TODO(peterd): unify with VersionSet::db_id_
1256   std::string db_id_;
1257   // db_session_id_ is an identifier that gets reset
1258   // every time the DB is opened
1259   std::string db_session_id_;
       // Exposed (non-owning) through GetVersionSet().
1260   std::unique_ptr<VersionSet> versions_;
1261   // Flag to check whether we allocated and own the info log file
1262   bool own_info_log_;
1263   Status init_logger_creation_s_;
1264   const DBOptions initial_db_options_;
1265   Env* const env_;
1266   std::shared_ptr<IOTracer> io_tracer_;
       // Exposed read-only through immutable_db_options().
1267   const ImmutableDBOptions immutable_db_options_;
1268   FileSystemPtr fs_;
1269   MutableDBOptions mutable_db_options_;
1270   Statistics* stats_;
       // Transaction name -> recovered transaction. The mapped pointers are
       // owned by this map's holder: they are allocated in
       // InsertRecoveredTransaction() and freed in DeleteRecoveredTransaction()
       // / DeleteAllRecoveredTransactions().
1271   std::unordered_map<std::string, RecoveredTransaction*>
1272       recovered_transactions_;
1273   std::unique_ptr<Tracer> tracer_;
1274   InstrumentedMutex trace_mutex_;
1275   BlockCacheTracer block_cache_tracer_;
1276
1277   // Constant "false" canceled flag, used when the compaction is not manual.
1278   const std::atomic<bool> kManualCompactionCanceledFalse_{false};
1279
1280   // State below is protected by mutex_
1281   // With two_write_queues enabled, some of the variables that are accessed
1282   // during WriteToWAL need different synchronization: log_empty_,
1283   // alive_log_files_, logs_, logfile_number_. Refer to the definition of each
1284   // variable below for more description.
1285   //
1286   // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
1287   // cachelines.
       // Declared mutable so that const member functions (e.g. mutex()) can hand
       // it out and lock it.
1288   mutable CacheAlignedInstrumentedMutex mutex_;
1289
1290   ColumnFamilyHandleImpl* default_cf_handle_;
1291   InternalStats* default_cf_internal_stats_;
1292
1293   // table_cache_ provides its own synchronization
1294   std::shared_ptr<Cache> table_cache_;
1295
1296   ErrorHandler error_handler_;
1297
1298   // Unified interface for logging events
1299   EventLogger event_logger_;
1300
1301   // only used for dynamically adjusting max_total_wal_size. it is a sum of
1302   // [write_buffer_size * max_write_buffer_number] over all column families
1303   std::atomic<uint64_t> max_total_in_memory_state_;
1304
1305   // The options to access storage files
1306   const FileOptions file_options_;
1307
1308   // Additional options for compaction and flush
1309   FileOptions file_options_for_compaction_;
1310
1311   std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
1312
1313   // Increase the sequence number after writing each batch, whether memtable is
1314   // disabled for that or not. Otherwise the sequence number is increased after
1315   // writing each key into memtable. This implies that when disable_memtable is
1316   // set, the seq is not increased at all.
1317   //
1318   // Default: false
       // Exposed through seq_per_batch().
1319   const bool seq_per_batch_;
1320   // This determines during recovery whether we expect one writebatch per
1321   // recovered transaction, or potentially multiple writebatches per
1322   // transaction. For WriteUnprepared, this is set to false, since multiple
1323   // batches can exist per transaction.
1324   //
1325   // Default: true
1326   const bool batch_per_txn_;
1327
1328   // Each flush or compaction gets its own job id. This counter makes sure
1329   // they're unique
1330   std::atomic<int> next_job_id_;
1331
1332   std::atomic<bool> shutting_down_;
1333
1334   // RecoveryContext struct stores the context about version edits along
1335   // with corresponding column_family_data and column_family_options.
       //
       // Ownership: every VersionEdit stored in edit_lists_ is heap-allocated by
       // UpdateVersionEdits() and deleted by the destructor. Copying is therefore
       // disabled; an implicit member-wise copy would delete each edit twice.
1336   class RecoveryContext {
1337    public:
         RecoveryContext() = default;
         RecoveryContext(const RecoveryContext&) = delete;
         RecoveryContext& operator=(const RecoveryContext&) = delete;

         // Frees every VersionEdit accumulated in edit_lists_.
1338     ~RecoveryContext() {
1339       for (auto& edit_list : edit_lists_) {
1340         for (auto* edit : edit_list) {
1341           delete edit;
1342         }
1343       }
1344     }
1345
         // Appends a heap-allocated copy of `edit` to the edit list of `cfd`.
         // The first edit seen for a column family also registers that column
         // family: it is assigned the next index and its cfd, latest mutable
         // options and an empty edit list are stored at that index.
1346     void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
1347       assert(cfd != nullptr);
1348       const uint32_t cf_id = cfd->GetID();
1349       auto pos = map_.find(cf_id);
1350       if (pos == map_.end()) {
1351         const uint32_t next_index = static_cast<uint32_t>(map_.size());
1352         pos = map_.emplace(cf_id, next_index).first;
1353         cfds_.emplace_back(cfd);
1354         mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
1355         edit_lists_.emplace_back(autovector<VersionEdit*>());
1356       }
1357       edit_lists_[pos->second].emplace_back(new VersionEdit(edit));
1358     }
1359
         // cfds_, mutable_cf_opts_ and edit_lists_ are parallel containers: the
         // entry for a column family lives at index map_[cf_id] in each of them.
1360     std::unordered_map<uint32_t, uint32_t> map_;  // cf_id to index;
1361     autovector<ColumnFamilyData*> cfds_;
1362     autovector<const MutableCFOptions*> mutable_cf_opts_;
1363     autovector<autovector<VersionEdit*>> edit_lists_;
1364     // files_to_delete_ contains sst files
1365     std::unordered_set<std::string> files_to_delete_;
1366   };
1366
1367   // Persist options to options file.
1368   // NOTE(review): the original comment began "Except in DB::Open(),
       // WriteOptionsFile can only be called when:" but never stated the
       // condition -- restore it from the definition or history.
1369   // If need_mutex_lock = false, the method will lock DB mutex.
1370   // If need_enter_write_thread = false, the method will enter write thread.
       // NOTE(review): the two sentences above read as inverted relative to the
       // parameter names (one would expect `true` to mean "this method takes the
       // lock / enters the write thread") -- confirm against the definition
       // before relying on them.
1371   Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
1372
       // NOTE(review): presumably the shared implementation behind the public
       // CompactRange() entry points, with `trim_ts` as an extra timestamp
       // argument -- confirm in the definition.
1373   Status CompactRangeInternal(const CompactRangeOptions& options,
1374                               ColumnFamilyHandle* column_family,
1375                               const Slice* begin, const Slice* end,
1376                               const std::string& trim_ts);
1377
1378   // The following two functions can only be called when:
1379   // 1. WriteThread::Writer::EnterUnbatched() is used.
1380   // 2. db_mutex is NOT held
       // NOTE(review): presumably the first publishes the temporary options file
       // `file_name` under its final name and the second removes older options
       // files -- confirm in the definitions.
1381   Status RenameTempFileToOptionsFile(const std::string& file_name);
1382   Status DeleteObsoleteOptionsFiles();
1383
       // Event notification helpers for flush, compaction and memtable-sealed
       // events.
       // NOTE(review): presumably each one invokes the matching callback on the
       // registered event listeners -- confirm in the definitions, including
       // whether the DB mutex must be held by the caller.
1384   void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
1385                           const MutableCFOptions& mutable_cf_options,
1386                           int job_id);
1387
1388   void NotifyOnFlushCompleted(
1389       ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
1390       std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
1391
1392   void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
1393                                const Status& st,
1394                                const CompactionJobStats& job_stats, int job_id);
1395
1396   void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
1397                                    const Status& st,
1398                                    const CompactionJobStats& job_stats,
1399                                    int job_id);
1400   void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
1401                               const MemTableInfo& mem_table_info);
1402
#ifndef ROCKSDB_LITE
  // The two declarations below exist only when ROCKSDB_LITE is not defined.
  void NotifyOnExternalFileIngested(
      ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);

  // Virtual, so a subclass may override it.
  // NOTE(review): presumably flushes memtables before the live-file list is
  // produced -- confirm against the definition.
  virtual Status FlushForGetLiveFiles();
#endif  // !ROCKSDB_LITE
1409
  // Thread-status bookkeeping helpers. All three are const member functions.
  // NOTE(review): presumably they register / unregister column-family and DB
  // entries with the thread-status tracking facility -- confirm against the
  // definitions.
  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;

  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;

  void EraseThreadStatusDbInfo() const;
1415
  // If disable_memtable is set the application logic must guarantee that the
  // batch will still be skipped from memtable during the recovery. An
  // exception to this is seq_per_batch_ mode, in which since each batch
  // already takes one seq, it is ok for the batch to write to memtable during
  // recovery as long as it only takes one sequence number: i.e., no duplicate
  // keys.
  // In WriteCommitted it is guaranteed since disable_memtable is used for
  // prepare batch which will be written to memtable later during the commit,
  // and in WritePrepared it is guaranteed since it will be used only for WAL
  // markers which will never be written to memtable. If the commit marker is
  // accompanied with CommitTimeWriteBatch that is not written to memtable as
  // long as it has no duplicate keys, it does not violate the one-seq-per-batch
  // policy.
  // batch_cnt is expected to be non-zero in seq_per_batch mode and
  // indicates the number of sub-batches. A sub-batch is a subset of the write
  // batch that does not have duplicate keys.
  // Every parameter after `updates` is optional; the pointer parameters
  // default to nullptr, log_ref and batch_cnt to 0, and disable_memtable to
  // false.
  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
                   WriteCallback* callback = nullptr,
                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
                   bool disable_memtable = false, uint64_t* seq_used = nullptr,
                   size_t batch_cnt = 0,
                   PreReleaseCallback* pre_release_callback = nullptr,
                   PostMemTableCallback* post_memtable_callback = nullptr);
1438
  // Write entry point whose leading parameters mirror WriteImpl's, but
  // without the batch_cnt / pre-release / post-memtable callback parameters.
  // NOTE(review): presumably the write path used when
  // enable_pipelined_write is set -- confirm against the definition.
  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
                            WriteCallback* callback = nullptr,
                            uint64_t* log_used = nullptr, uint64_t log_ref = 0,
                            bool disable_memtable = false,
                            uint64_t* seq_used = nullptr);

  // Write only to memtables without joining any write queue
  Status UnorderedWriteMemtable(const WriteOptions& write_options,
                                WriteBatch* my_batch, WriteCallback* callback,
                                uint64_t log_ref, SequenceNumber seq,
                                const size_t sub_batch_cnt);
1450
  // Whether the batch requires to be assigned with an order.
  // The underlying type is bool: kDontAssignOrder is false, kDoAssignOrder is
  // true.
  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
  // Whether it requires publishing last sequence or not.
  // The underlying type is bool: kDontPublishLastSeq is false,
  // kDoPublishLastSeq is true.
  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
1455
  // Join the write_thread to write the batch only to the WAL. It is the
  // responsibility of the caller to also write the write batch to the memtable
  // if it is required.
  //
  // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
  // indicating the number of sub-batches in my_batch. A sub-batch is a subset
  // of the write batch that does not have duplicate keys. When seq_per_batch is
  // not set, each key is a separate sub_batch. Otherwise each duplicate key
  // marks start of a new sub-batch.
  // NOTE(review): "my_batch" above presumably refers to the `updates`
  // parameter; no parameter of this function is named my_batch.
  Status WriteImplWALOnly(
      WriteThread* write_thread, const WriteOptions& options,
      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
      PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
      const PublishLastSeq publish_last_seq, const bool disable_memtable);
1471
  // Write cached_recoverable_state_ to memtable if it is not empty.
  // REQUIRES: the caller is the leader in write_thread_ and holds mutex_.
  Status WriteRecoverableState();

  // Actual implementation of Close()
  Status CloseImpl();
1478
  // Recover the descriptor from persistent storage.  May do a significant
  // amount of work to recover recently logged updates.  Any changes to
  // be made to the descriptor are added to *edit.
  // NOTE(review): there is no parameter named `edit` in this signature; the
  // sentence above presumably predates recovery_ctx, which now carries the
  // version edits -- confirm and reword.
  // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
  // skipped.
  // recovery_ctx stores the context about version edits and all those
  // edits are persisted to new Manifest after successfully syncing the new WAL.
  // All parameters except column_families are optional (flags default to
  // false, pointers to nullptr). Virtual, so subclasses may override it.
  virtual Status Recover(
      const std::vector<ColumnFamilyDescriptor>& column_families,
      bool read_only = false, bool error_if_wal_file_exists = false,
      bool error_if_data_exists_in_wals = false,
      uint64_t* recovered_seq = nullptr,
      RecoveryContext* recovery_ctx = nullptr);
1492
  // Always returns true in this class; virtual so a subclass can report
  // otherwise.
  // NOTE(review): presumably tells callers whether this instance owns (and
  // may therefore delete) the table and log files -- confirm with the
  // overriding classes.
  virtual bool OwnTablesAndLogs() const { return true; }

  // Setup DB identity file, and write DB ID to manifest if necessary.
  Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
  // Assign db_id_ and write DB ID to manifest if necessary.
  // `id` is taken by rvalue reference, so the caller gives up its string.
  void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
1499
  // REQUIRES: db mutex held when calling this function, but the db mutex can
  // be released and re-acquired. Db mutex will be held when the function
  // returns.
  // After recovery, there may be SST files in db/cf paths that are
  // not referenced in the MANIFEST, e.g. because:
  // 1. It's best effort recovery;
  // 2. The VersionEdits referencing the SST files are appended to
  //    RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits
  //    are still not synced to MANIFEST during recovery.
  // It stores the SST files to be deleted in RecoveryContext. In the
  // meantime, we find out the largest file number present in the paths, and
  // bump up the version set's next_file_number_ to be 1 + largest_file_number.
  // recovery_ctx stores the context about version edits and files to be
  // deleted. All those edits are persisted to new Manifest after successfully
  // syncing the new WAL.
  Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
1516
  // SetDbSessionId() should be called in the constructor DBImpl()
  // to ensure that db_session_id_ gets updated every time the DB is opened
  void SetDbSessionId();

  // Timestamp-related argument checks; both return a Status.
  // NOTE(review): presumably FailIfCfHasTs returns a non-OK status when the
  // column family has user-defined timestamps enabled, and FailIfTsMismatchCf
  // when `ts` does not match what the column family expects -- confirm
  // against the definitions.
  Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
  Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
                            bool ts_for_read) const;
1524
  // recovery_ctx stores the context about version edits and
  // LogAndApplyForRecovery persists all those edits to the new Manifest after
  // successfully syncing the new WAL.
  // LogAndApplyForRecovery should be called only once during recovery and it
  // should be called when RocksDB writes to the first new MANIFEST since this
  // recovery.
  Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
1532
  // NOTE(review): presumably hands the column-family-to-WAL-number mapping to
  // the configured WAL filter, if there is one -- confirm against the
  // definition.
  void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();

  // Return true to proceed with current WAL record whose content is stored in
  // `batch`. Return false to skip current WAL record.
  // `status`, `stop_replay` and `batch` are non-const references, i.e. the
  // function is able to modify them.
  bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
                                          const std::string& wal_fname,
                                          log::Reader::Reporter& reporter,
                                          Status& status, bool& stop_replay,
                                          WriteBatch& batch);
1542
 private:
  // Classes and structs granted access to DBImpl's non-public members.
  friend class DB;
  friend class ErrorHandler;
  friend class InternalStats;
  friend class PessimisticTransaction;
  friend class TransactionBaseImpl;
  friend class WriteCommittedTxn;
  friend class WritePreparedTxn;
  friend class WritePreparedTxnDB;
  friend class WriteBatchWithIndex;
  friend class WriteUnpreparedTxnDB;
  friend class WriteUnpreparedTxn;

#ifndef ROCKSDB_LITE
  friend class ForwardIterator;
#endif
  friend struct SuperVersion;
  friend class CompactedDBImpl;
  // NOTE(review): the *_Test names below look like generated test-case
  // classes -- confirm.
  friend class DBTest_ConcurrentFlushWAL_Test;
  friend class DBTest_MixedSlowdownOptionsStop_Test;
  friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
  friend class DBCompactionTest_CompactionDuringShutdown_Test;
  friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
#ifndef NDEBUG
  // Befriended only when NDEBUG is not defined.
  friend class DBTest2_ReadCallbackTest_Test;
  friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
  friend class XFTransactionWriteHandler;
  friend class DBBlobIndexTest;
  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
#endif

  // Forward declarations of nested types.
  struct CompactionState;
  struct PrepickedCompaction;
  struct PurgeFileInfo;
1577
1578   struct WriteContext {
1579     SuperVersionContext superversion_context;
1580     autovector<MemTable*> memtables_to_free_;
1581
1582     explicit WriteContext(bool create_superversion = false)
1583         : superversion_context(create_superversion) {}
1584
1585     ~WriteContext() {
1586       superversion_context.Clean();
1587       for (auto& m : memtables_to_free_) {
1588         delete m;
1589       }
1590     }
1591   };
1592
1593   struct LogFileNumberSize {
1594     explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
1595     LogFileNumberSize() {}
1596     void AddSize(uint64_t new_size) { size += new_size; }
1597     uint64_t number;
1598     uint64_t size = 0;
1599     bool getting_flushed = false;
1600   };
1601
1602   struct LogWriterNumber {
1603     // pass ownership of _writer
1604     LogWriterNumber(uint64_t _number, log::Writer* _writer)
1605         : number(_number), writer(_writer) {}
1606
1607     log::Writer* ReleaseWriter() {
1608       auto* w = writer;
1609       writer = nullptr;
1610       return w;
1611     }
1612     Status ClearWriter() {
1613       Status s = writer->WriteBuffer();
1614       delete writer;
1615       writer = nullptr;
1616       return s;
1617     }
1618
1619     bool IsSyncing() { return getting_synced; }
1620
1621     uint64_t GetPreSyncSize() {
1622       assert(getting_synced);
1623       return pre_sync_size;
1624     }
1625
1626     void PrepareForSync() {
1627       assert(!getting_synced);
1628       // Size is expected to be monotonically increasing.
1629       assert(writer->file()->GetFlushedSize() >= pre_sync_size);
1630       getting_synced = true;
1631       pre_sync_size = writer->file()->GetFlushedSize();
1632     }
1633
1634     void FinishSync() {
1635       assert(getting_synced);
1636       getting_synced = false;
1637     }
1638
1639     uint64_t number;
1640     // Visual Studio doesn't support deque's member to be noncopyable because
1641     // of a std::unique_ptr as a member.
1642     log::Writer* writer;  // own
1643
1644    private:
1645     // true for some prefix of logs_
1646     bool getting_synced = false;
1647     // The size of the file before the sync happens. This amount is guaranteed
1648     // to be persisted even if appends happen during sync so it can be used for
1649     // tracking the synced size in MANIFEST.
1650     uint64_t pre_sync_size = 0;
1651   };
1652
1653   struct LogContext {
1654     explicit LogContext(bool need_sync = false)
1655         : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
1656     bool need_log_sync = false;
1657     bool need_log_dir_sync = false;
1658     log::Writer* writer = nullptr;
1659     LogFileNumberSize* log_file_number_size = nullptr;
1660   };
1661
1662   // PurgeFileInfo is a structure to hold information of files to be deleted in
1663   // purge_files_
1664   struct PurgeFileInfo {
1665     std::string fname;
1666     std::string dir_to_sync;
1667     FileType type;
1668     uint64_t number;
1669     int job_id;
1670     PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
1671                   int jid)
1672         : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
1673   };
1674
1675   // Argument required by background flush thread.
1676   struct BGFlushArg {
1677     BGFlushArg()
1678         : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
1679     BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
1680                SuperVersionContext* superversion_context)
1681         : cfd_(cfd),
1682           max_memtable_id_(max_memtable_id),
1683           superversion_context_(superversion_context) {}
1684
1685     // Column family to flush.
1686     ColumnFamilyData* cfd_;
1687     // Maximum ID of memtable to flush. In this column family, memtables with
1688     // IDs smaller than this value must be flushed before this flush completes.
1689     uint64_t max_memtable_id_;
1690     // Pointer to a SuperVersionContext object. After flush completes, RocksDB
1691     // installs a new superversion for the column family. This operation
1692     // requires a SuperVersionContext object (currently embedded in JobContext).
1693     SuperVersionContext* superversion_context_;
1694   };
1695
  // Argument passed to flush thread.
  // Plain aggregate: neither member has a default initializer, so the creator
  // must set both.
  struct FlushThreadArg {
    // The DBImpl the flush work belongs to.
    DBImpl* db_;

    // Priority associated with the flush thread.
    Env::Priority thread_pri_;
  };
1702
  // Information for a manual compaction
  struct ManualCompactionState {
    // _canceled may be null; in that case `canceled` is bound to this object's
    // own canceled_internal_storage instead of a caller-provided flag.
    ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
                          int _output_level, uint32_t _output_path_id,
                          bool _exclusive, bool _disallow_trivial_move,
                          std::atomic<bool>* _canceled)
        : cfd(_cfd),
          input_level(_input_level),
          output_level(_output_level),
          output_path_id(_output_path_id),
          exclusive(_exclusive),
          disallow_trivial_move(_disallow_trivial_move),
          canceled(_canceled ? *_canceled : canceled_internal_storage) {}
    // When _canceled is not provided by the user, we assign the reference of
    // canceled_internal_storage to it to consolidate canceled and
    // manual_compaction_paused since DisableManualCompaction() might be
    // called

    ColumnFamilyData* cfd;
    int input_level;
    int output_level;
    uint32_t output_path_id;
    Status status;
    bool done = false;
    bool in_progress = false;    // compaction request being processed?
    bool incomplete = false;     // only part of requested range compacted
    bool exclusive;              // current behavior of only one manual
    bool disallow_trivial_move;  // Force actual compaction to run
    const InternalKey* begin = nullptr;  // nullptr means beginning of key range
    const InternalKey* end = nullptr;    // nullptr means end of key range
    InternalKey* manual_end = nullptr;   // how far we are compacting
    InternalKey tmp_storage;      // Used to keep track of compaction progress
    InternalKey tmp_storage1;     // Used to keep track of compaction progress

    // When the user provides a canceled pointer in CompactRangeOptions, the
    // `canceled` member below is a reference to the user-provided flag;
    // otherwise it is a reference to canceled_internal_storage.
    std::atomic<bool> canceled_internal_storage = false;
    std::atomic<bool>& canceled;  // Compaction canceled pointer reference
  };
  // A compaction that was picked ahead of time, together with the optional
  // manual-compaction state and task-limiter token that go with it.
  struct PrepickedCompaction {
    // background compaction takes ownership of `compaction`.
    Compaction* compaction;
    // caller retains ownership of `manual_compaction_state` as it is reused
    // across background compactions.
    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
    // task limiter token is requested during compaction picking.
    std::unique_ptr<TaskLimiterToken> task_token;
  };

  // Bundle of a DB, a prepicked compaction and a priority.
  // NOTE(review): presumably the `void* arg` handed to the static
  // compaction entry points -- confirm against their definitions.
  struct CompactionArg {
    // caller retains ownership of `db`.
    DBImpl* db;
    // background compaction takes ownership of `prepicked_compaction`.
    PrepickedCompaction* prepicked_compaction;
    // Priority associated with the compaction.
    Env::Priority compaction_pri_;
  };
1760
  // Initialize the built-in column family for persistent stats. Depending on
  // whether on-disk persistent stats have been enabled before, it may either
  // create a new column family and column family handle or just a column family
  // handle.
  // Required: DB mutex held
  Status InitPersistStatsColumnFamily();

  // The persistent stats column family has two format version keys which are
  // used for compatibility checks. Write the format version if the column
  // family is created for the first time; read the format version and check
  // compatibility if recovering from disk. This function requires the DB mutex
  // held at entrance but may release and re-acquire the DB mutex in the
  // process.
  // Required: DB mutex held
  Status PersistentStatsProcessFormatVersion();
1775
  // NOTE(review): presumably the implementation behind resuming the DB after
  // a background error, with `context` describing the recovery -- confirm
  // against the definition.
  Status ResumeImpl(DBRecoverContext context);

  // Takes the status by pointer, so it is able to modify it in place.
  // NOTE(review): which errors are ignored, and under which options, is
  // decided in the definition -- confirm there.
  void MaybeIgnoreError(Status* s) const;

  const Status CreateArchivalDirectory();

  // Implementation helpers for creating / dropping a column family.
  // CreateColumnFamilyImpl reports the new handle through `handle`.
  Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
                                const std::string& cf_name,
                                ColumnFamilyHandle** handle);

  Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
1787
  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();
  // Delete a single obsolete file and log the status and information of the
  // file deletion. The file is identified by its name, type and number;
  // job_id and path_to_sync are passed alongside.
  void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
                              const std::string& path_to_sync, FileType type,
                              uint64_t number);
1794
  // Background process needs to call
  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
  //     auto file_num = versions_->NewFileNumber();
  //     <do something>
  //     ReleaseFileNumberFromPendingOutputs(x)
  // This will protect any file with number `file_num` or greater from being
  // deleted while <do something> is running.
  // NOTE(review): the snippet above is schematic. As declared, Capture...
  // returns a std::list<uint64_t>::iterator while Release... takes a
  // std::unique_ptr to such an iterator, so the value has to be wrapped in a
  // unique_ptr in between.
  // -----------
  // This function will capture current file number and append it to
  // pending_outputs_. This will prevent any background process from deleting
  // any file created after this point.
  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
  // This function should be called with the result of
  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
  // and blocked by any other pending_outputs_ calls)
  void ReleaseFileNumberFromPendingOutputs(
      std::unique_ptr<std::list<uint64_t>::iterator>& v);
1814
  // Returns an IOStatus rather than a Status.
  // NOTE(review): presumably syncs the logs that are already closed and
  // records the synced WALs in `synced_wals` -- confirm against the
  // definition.
  IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);

  // Flush the in-memory write buffer to storage.  Switches to a new
  // log-file/memtable and writes a new descriptor iff successful. Then
  // installs a new super version for the column family.
  // NOTE(review): madeProgress is an out-parameter; presumably set to true
  // when the flush did some work -- confirm against the definition.
  Status FlushMemTableToOutputFile(
      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
      bool* madeProgress, JobContext* job_context,
      SuperVersionContext* superversion_context,
      std::vector<SequenceNumber>& snapshot_seqs,
      SequenceNumber earliest_write_conflict_snapshot,
      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
      Env::Priority thread_pri);
1828
  // Flush the memtables of (multiple) column families to multiple files on
  // persistent storage. Each BGFlushArg names one column family, the maximum
  // memtable ID to flush for it and the SuperVersionContext to use.
  Status FlushMemTablesToOutputFiles(
      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);

  // Same parameter list as FlushMemTablesToOutputFiles.
  // NOTE(review): presumably the variant used when atomic flush is enabled,
  // flushing all listed column families as one unit -- confirm against the
  // definition.
  Status AtomicFlushMemTablesToOutputFiles(
      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
1838
  // REQUIRES: log_numbers are sorted in ascending order
  // corrupted_log_found is set to true if we recover from a corrupted log file.
  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
                         SequenceNumber* next_sequence, bool read_only,
                         bool* corrupted_log_found,
                         RecoveryContext* recovery_ctx);

  // The following two methods are used to flush a memtable to
  // storage. The first one is used at database recovery time (when the
  // database is opened) and is heavyweight because it holds the mutex
  // for the entire period. The second method WriteLevel0Table supports
  // flushing memtables to storage concurrently.
  // NOTE(review): only the first method is declared here; no WriteLevel0Table
  // declaration follows in this part of the header, so the reference to it
  // may be stale -- confirm.
  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                     MemTable* mem, VersionEdit* edit);
1853
  // Get the size of a log file and, if truncate is true, truncate the
  // log file to its actual size, thereby freeing preallocated space.
  // Return success even if truncate fails.
  // The result is reported through the `log` out-parameter.
  Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
                                    LogFileNumberSize* log);

  // Restore alive_log_files_ and total_log_size_ after recovery.
  // It needs to run only when there's no flush during recovery
  // (e.g. avoid_flush_during_recovery=true). May also trigger flush
  // in case total_log_size > max_total_wal_size.
  Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
1865
  // num_bytes: for slowdown case, delay time is calculated based on
  //            `num_bytes` going through.
  Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);

  // Begin stalling of writes when memory usage increases beyond a certain
  // threshold.
  void WriteBufferManagerStallWrites();

  // NOTE(review): presumably rate-limits or rejects writes flagged as low
  // priority in write_options while the DB is under compaction pressure --
  // confirm against the definition.
  Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
                                      WriteBatch* my_batch);
1876
  // REQUIRES: mutex locked and in write thread.
  Status ScheduleFlushes(WriteContext* context);

  // Takes the column family list by pointer, so it is able to modify it.
  // NOTE(review): presumably appends the persistent-stats column family when
  // it should be flushed together with the others -- confirm against the
  // definition.
  void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);

  Status TrimMemtableHistory(WriteContext* context);

  // NOTE(review): presumably shares the locking requirement stated on
  // ScheduleFlushes (mutex locked and in write thread) -- confirm against the
  // definition.
  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);

  // Output parameter: `cfds` receives the selected column families.
  void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
1887
  // Force current memtable contents to be flushed.
  // entered_write_thread defaults to false.
  // NOTE(review): presumably pass true when the caller has already entered
  // the write thread -- confirm against the definition.
  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
                       FlushReason flush_reason,
                       bool entered_write_thread = false);

  // Multi-column-family counterpart of FlushMemTable; takes the same trailing
  // parameters.
  Status AtomicFlushMemTables(
      const autovector<ColumnFamilyData*>& column_family_datas,
      const FlushOptions& options, FlushReason flush_reason,
      bool entered_write_thread = false);

  // Wait until flushing this column family won't stall writes.
  // flush_needed is an out-parameter.
  Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
                                           bool* flush_needed);
1901
1902   // Wait for memtable flushed.
1903   // If flush_memtable_id is non-null, wait until the memtable with the ID
1904   // gets flush. Otherwise, wait until the column family don't have any
1905   // memtable pending flush.
1906   // resuming_from_bg_err indicates whether the caller is attempting to resume
1907   // from background error.
1908   Status WaitForFlushMemTable(ColumnFamilyData* cfd,
1909                               const uint64_t* flush_memtable_id = nullptr,
1910                               bool resuming_from_bg_err = false) {
1911     return WaitForFlushMemTables({cfd}, {flush_memtable_id},
1912                                  resuming_from_bg_err);
1913   }
  // Wait for memtables to be flushed for multiple column families.
  // NOTE(review): presumably flush_memtable_ids[i] applies to cfds[i], with a
  // null entry meaning "wait until that column family has no memtable pending
  // flush" -- confirm against the definition.
  Status WaitForFlushMemTables(
      const autovector<ColumnFamilyData*>& cfds,
      const autovector<const uint64_t*>& flush_memtable_ids,
      bool resuming_from_bg_err);
1919
1920   inline void WaitForPendingWrites() {
1921     mutex_.AssertHeld();
1922     TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
1923     // In case of pipelined write is enabled, wait for all pending memtable
1924     // writers.
1925     if (immutable_db_options_.enable_pipelined_write) {
1926       // Memtable writers may call DB::Get in case max_successive_merges > 0,
1927       // which may lock mutex. Unlocking mutex here to avoid deadlock.
1928       mutex_.Unlock();
1929       write_thread_.WaitForMemTableWriters();
1930       mutex_.Lock();
1931     }
1932
1933     if (!immutable_db_options_.unordered_write) {
1934       // Then the writes are finished before the next write group starts
1935       return;
1936     }
1937
1938     // Wait for the ones who already wrote to the WAL to finish their
1939     // memtable write.
1940     if (pending_memtable_writes_.load() != 0) {
1941       std::unique_lock<std::mutex> guard(switch_mutex_);
1942       switch_cv_.wait(guard,
1943                       [&] { return pending_memtable_writes_.load() == 0; });
1944     }
1945   }
1946
  // TaskType is used to identify tasks in the thread-pool. Currently it only
  // differentiates manual compaction, which could be unscheduled from the
  // thread-pool.
  // The underlying type is uint8_t.
  enum class TaskType : uint8_t {
    kDefault = 0,
    kManualCompaction = 1,
    // Equals the number of enumerators above it.
    kCount = 2,
  };
1955
1956   // Task tag is used to identity tasks in thread-pool, which is
1957   // dbImpl obj address + type
1958   inline void* GetTaskTag(TaskType type) {
1959     return GetTaskTag(static_cast<uint8_t>(type));
1960   }
1961
1962   inline void* GetTaskTag(uint8_t type) {
1963     return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
1964   }
1965
  // REQUIRES: mutex locked and in write thread.
  // NOTE(review): presumably assigns the sequence number used by an atomic
  // flush to each listed column family -- confirm against the definition.
  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);

  // REQUIRES: mutex locked and in write thread.
  Status SwitchWAL(WriteContext* write_context);

  // REQUIRES: mutex locked and in write thread.
  Status HandleWriteBufferManagerFlush(WriteContext* write_context);

  // REQUIRES: mutex locked
  // Both contexts are passed by pointer, so the function is able to update
  // them.
  Status PreprocessWrite(const WriteOptions& write_options,
                         LogContext* log_context, WriteContext* write_context);
1978
  // Merge write batches in the write group into merged_batch.
  // Returns OK if merge is successful.
  // Returns Corruption if corruption in write batch is detected.
  // merged_batch, write_with_wal and to_be_cached_state are out-parameters.
  // NOTE(review): presumably tmp_batch is scratch space that *merged_batch
  // may end up pointing at -- confirm against the definition.
  Status MergeBatch(const WriteThread::WriteGroup& write_group,
                    WriteBatch* tmp_batch, WriteBatch** merged_batch,
                    size_t* write_with_wal, WriteBatch** to_be_cached_state);
1985
  // Writes one already-merged batch through log_writer.
  // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
  // for automatic WAL flush (`Options::manual_wal_flush` == false)
  // associated with this WriteToWAL
  IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
                      uint64_t* log_used, uint64_t* log_size,
                      Env::IOPriority rate_limiter_priority,
                      LogFileNumberSize& log_file_number_size);

  // Overload that takes a whole write group plus the sync flags and the
  // sequence number instead of a single merged batch.
  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
                      log::Writer* log_writer, uint64_t* log_used,
                      bool need_log_sync, bool need_log_dir_sync,
                      SequenceNumber sequence,
                      LogFileNumberSize& log_file_number_size);

  // NOTE(review): presumably the write-group variant used when concurrent WAL
  // writes are allowed, reporting the resulting last sequence through
  // `last_sequence` -- confirm against the definition.
  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
                                uint64_t* log_used,
                                SequenceNumber* last_sequence, size_t seq_inc);
2003
  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
  // Caller must hold mutex_.
  void WriteStatusCheckOnLocked(const Status& status);

  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
  // NOTE(review): unlike WriteStatusCheckOnLocked, no "caller must hold
  // mutex_" requirement is stated here; presumably this variant takes the
  // mutex itself -- confirm against the definition.
  void WriteStatusCheck(const Status& status);

  // Used by WriteImpl to update bg_error_ when an IO error happens (e.g.
  // writing or syncing the WAL fails), if paranoid check is enabled.
  void IOStatusCheck(const IOStatus& status);

  // Used by WriteImpl to update bg_error_ in case of memtable insert error.
  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
2017
#ifndef ROCKSDB_LITE
  // Only declared when ROCKSDB_LITE is not defined.
  // NOTE(review): presumably compacts the named input files into
  // output_level and reports the produced files through output_file_names --
  // confirm against the definition.
  Status CompactFilesImpl(const CompactionOptions& compact_options,
                          ColumnFamilyData* cfd, Version* version,
                          const std::vector<std::string>& input_file_names,
                          std::vector<std::string>* const output_file_names,
                          const int output_level, int output_path_id,
                          JobContext* job_context, LogBuffer* log_buffer,
                          CompactionJobInfo* compaction_job_info);

  // Wait for current IngestExternalFile() calls to finish.
  // REQUIRES: mutex_ held
  void WaitForIngestFile();
#else
  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
  // is a no-op.
  void WaitForIngestFile() {}
#endif  // ROCKSDB_LITE
2035
  // NOTE(review): presumably returns nullptr when no column family has the
  // given name -- confirm against the definition.
  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);

  void MaybeScheduleFlushOrCompaction();

  // A flush request specifies the column families to flush as well as the
  // largest memtable id to persist for each column family. Once all the
  // memtables whose IDs are smaller than or equal to this per-column-family
  // specified value have been flushed, this flush request is considered to
  // have completed its work of flushing this column family. After completing
  // the work for all column families in this request, this flush is
  // considered complete.
  using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;

  // Fills *req from the given column families.
  void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
                            FlushRequest* req);

  void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);

  void SchedulePendingCompaction(ColumnFamilyData* cfd);
  // Takes the file name, the directory to sync, the file type, the file
  // number and the job id of one file to purge.
  void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
                            FileType type, uint64_t number, int job_id);
2056   static void BGWorkCompaction(void* arg);
2057   // Runs a pre-chosen universal compaction involving bottom level in a
2058   // separate, bottom-pri thread pool.
2059   static void BGWorkBottomCompaction(void* arg);
2060   static void BGWorkFlush(void* arg);
2061   static void BGWorkPurge(void* arg);
2062   static void UnscheduleCompactionCallback(void* arg);
2063   static void UnscheduleFlushCallback(void* arg);
2064   void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
2065                                 Env::Priority thread_pri);
2066   void BackgroundCallFlush(Env::Priority thread_pri);
2067   void BackgroundCallPurge();
2068   Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
2069                               LogBuffer* log_buffer,
2070                               PrepickedCompaction* prepicked_compaction,
2071                               Env::Priority thread_pri);
2072   Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
2073                          LogBuffer* log_buffer, FlushReason* reason,
2074                          Env::Priority thread_pri);
2075
2076   bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
2077                                const std::vector<CompactionInputFiles>& inputs,
2078                                bool* sfm_bookkeeping, LogBuffer* log_buffer);
2079
2080   // Request compaction tasks token from compaction thread limiter.
2081   // It always succeeds if force = true or limiter is disabled.
2082   bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
2083                               std::unique_ptr<TaskLimiterToken>* token,
2084                               LogBuffer* log_buffer);
2085
2086   // Schedule background tasks
2087   Status StartPeriodicTaskScheduler();
2088
2089   Status RegisterRecordSeqnoTimeWorker();
2090
2091   void PrintStatistics();
2092
2093   size_t EstimateInMemoryStatsHistorySize() const;
2094
2095   // Return the minimum empty level that could hold the total data in the
2096   // input level. Return the input level, if such level could not be found.
2097   int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
2098                                    const MutableCFOptions& mutable_cf_options,
2099                                    int level);
2100
2101   // Move the files in the input level to the target level.
2102   // If target_level < 0, automatically calculate the minimum level that could
2103   // hold the data set.
2104   Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
2105
2106   // helper functions for adding and removing from flush & compaction queues
2107   void AddToCompactionQueue(ColumnFamilyData* cfd);
2108   ColumnFamilyData* PopFirstFromCompactionQueue();
2109   FlushRequest PopFirstFromFlushQueue();
2110
2111   // Pick the first unthrottled compaction with task token from queue.
2112   ColumnFamilyData* PickCompactionFromQueue(
2113       std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
2114
2115   // helper function to call after some of the logs_ were synced
2116   void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
2117   Status ApplyWALToManifest(VersionEdit* edit);
2118   // WALs with log number up to up_to are not synced successfully.
2119   void MarkLogsNotSynced(uint64_t up_to);
2120
2121   SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
2122                                 bool lock = true);
2123
2124   // If snapshot_seq != kMaxSequenceNumber, then this function can only be
2125   // called from the write thread that publishes sequence numbers to readers.
2126   // For 1) write-committed, or 2) write-prepared + one-write-queue, this will
2127   // be the write thread performing memtable writes. For write-prepared with
2128   // two write queues, this will be the write thread writing commit marker to
2129   // the WAL.
2130   // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
2131   // ensuring no writes to the database.
2132   std::pair<Status, std::shared_ptr<const SnapshotImpl>>
2133   CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
2134                                 bool lock = true);
2135
2136   uint64_t GetMaxTotalWalSize() const;
2137
2138   FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
2139
2140   Status MaybeReleaseTimestampedSnapshotsAndCheck();
2141
2142   Status CloseHelper();
2143
2144   void WaitForBackgroundWork();
2145
2146   // Background threads call this function, which is just a wrapper around
2147   // the InstallSuperVersion() function. Background threads carry
2148   // sv_context which can have new_superversion already
2149   // allocated.
2150   // All ColumnFamily state changes go through this function. Here we analyze
2151   // the new state and we schedule background work if we detect that the new
2152   // state needs flush or compaction.
2153   void InstallSuperVersionAndScheduleWork(
2154       ColumnFamilyData* cfd, SuperVersionContext* sv_context,
2155       const MutableCFOptions& mutable_cf_options);
2156
2157   bool GetIntPropertyInternal(ColumnFamilyData* cfd,
2158                               const DBPropertyInfo& property_info,
2159                               bool is_locked, uint64_t* value);
2160   bool GetPropertyHandleOptionsStatistics(std::string* value);
2161
2162   bool HasPendingManualCompaction();
2163   bool HasExclusiveManualCompaction();
2164   void AddManualCompaction(ManualCompactionState* m);
2165   void RemoveManualCompaction(ManualCompactionState* m);
2166   bool ShouldntRunManualCompaction(ManualCompactionState* m);
2167   bool HaveManualCompaction(ColumnFamilyData* cfd);
2168   bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
2169 #ifndef ROCKSDB_LITE
2170   void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
2171                               const Status& st,
2172                               const CompactionJobStats& compaction_job_stats,
2173                               const int job_id, const Version* current,
2174                               CompactionJobInfo* compaction_job_info) const;
2175   // Reserve the next 'num' file numbers for to-be-ingested external SST files,
2176   // and return the current file_number in 'next_file_number'.
2177   // Write a version edit to the MANIFEST.
2178   Status ReserveFileNumbersBeforeIngestion(
2179       ColumnFamilyData* cfd, uint64_t num,
2180       std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
2181       uint64_t* next_file_number);
2182 #endif  //! ROCKSDB_LITE
2183
2184   bool ShouldPurge(uint64_t file_number) const;
2185   void MarkAsGrabbedForPurge(uint64_t file_number);
2186
2187   size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
2188   Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }  // always WLTH_SHORT
2189
2190   IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
2191                      size_t preallocate_block_size, log::Writer** new_log);
2192
2193   // Validate self-consistency of DB options
2194   static Status ValidateOptions(const DBOptions& db_options);
2195   // Validate self-consistency of DB options and its consistency with cf options
2196   static Status ValidateOptions(
2197       const DBOptions& db_options,
2198       const std::vector<ColumnFamilyDescriptor>& column_families);
2199
2200   // Utility function to do some debug validation and sort the given vector
2201   // of MultiGet keys
2202   void PrepareMultiGetKeys(
2203       const size_t num_keys, bool sorted,
2204       autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
2205
2206   // A structure to hold the information required to process MultiGet of keys
2207   // belonging to one column family. For a multi column family MultiGet, there
2208   // will be a container of these objects.
2209   struct MultiGetColumnFamilyData {
2210     ColumnFamilyHandle* cf;  // must stay declared before cfd (see ctors)
2211     ColumnFamilyData* cfd;   // ctors set it from cf, cast unchecked to Impl
2212
2213     // For the batched MultiGet which relies on sorted keys, start specifies
2214     // the index of first key belonging to this column family in the sorted
2215     // list.
2216     size_t start;
2217
2218     // For the batched MultiGet case, num_keys specifies the number of keys
2219     // belonging to this column family in the sorted list
2220     size_t num_keys;
2221
2222     // SuperVersion for the column family obtained in a manner that ensures a
2223     // consistent view across all column families in the DB
2224     SuperVersion* super_version;
2225     MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
2226                              SuperVersion* sv)  // no key range: both are 0
2227         : cf(column_family),
2228           cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
2229           start(0),
2230           num_keys(0),
2231           super_version(sv) {}
2232     // As above, but also records the key range: start=first, num_keys=count.
2233     MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
2234                              size_t count, SuperVersion* sv)
2235         : cf(column_family),
2236           cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
2237           start(first),
2238           num_keys(count),
2239           super_version(sv) {}
2240     // Members are left uninitialized under default-initialization.
2241     MultiGetColumnFamilyData() = default;
2242   };
2243
2244   // A common function to obtain a consistent snapshot, which can be implicit
2245   // if the user doesn't specify a snapshot in read_options, across
2246   // multiple column families for MultiGet. It will attempt to get an implicit
2247   // snapshot without acquiring the db mutex, but will give up after a few
2248   // tries and acquire the mutex if a memtable flush happens. The template
2249   // allows both the batched and non-batched MultiGet to call this with
2250   // either an std::unordered_map or autovector of column families.
2251   //
2252   // If callback is non-null, the callback is refreshed with the snapshot
2253   // sequence number
2254   //
2255   // A return value of true indicates that the SuperVersions were obtained
2256   // from the ColumnFamilyData, whereas false indicates they are thread
2257   // local
2258   template <class T>
2259   bool MultiCFSnapshot(
2260       const ReadOptions& read_options, ReadCallback* callback,
2261       std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
2262           iter_deref_func,
2263       T* cf_list, SequenceNumber* snapshot);
2264
2265   // The actual implementation of the batching MultiGet. The caller is expected
2266   // to have acquired the SuperVersion and pass in a snapshot sequence number
2267   // in order to construct the LookupKeys. The start_key and num_keys specify
2268   // the range of keys in the sorted_keys vector for a single column family.
2269   Status MultiGetImpl(
2270       const ReadOptions& read_options, size_t start_key, size_t num_keys,
2271       autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
2272       SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
2273
2274   Status DisableFileDeletionsWithLock();
2275
2276   Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
2277                                       std::string ts_low);
2278
2279   bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
2280
2281   // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
2282   FileLock* db_lock_;
2283
2284   // In addition to mutex_, stats_history_mutex_ protects stats_history_ writes
2285   InstrumentedMutex stats_history_mutex_;
2286   // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
2287   // logfile_number_. With two_write_queues it also protects alive_log_files_,
2288   // and log_empty_. Refer to the definition of each variable below for more
2289   // details.
2290   // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
2291   // mutex_, the order should be first mutex_ and then log_write_mutex_.
2292   InstrumentedMutex log_write_mutex_;
2293
2294   // If zero, manual compactions are allowed to proceed. If non-zero, manual
2295   // compactions may still be running, but will quickly fail with
2296   // `Status::Incomplete`. The value indicates how many threads have paused
2297   // manual compactions. It is accessed in read mode outside the DB mutex in
2298   // compaction code paths.
2299   std::atomic<int> manual_compaction_paused_;
2300
2301   // This condition variable is signaled on these conditions:
2302   // * whenever bg_compaction_scheduled_ goes down to 0
2303   // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
2304   // made any progress
2305   // * whenever a compaction made any progress
2306   // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
2307   // (i.e. whenever a flush is done, even if it didn't make any progress)
2308   // * whenever there is an error in background purge, flush or compaction
2309   // * whenever num_running_ingest_file_ goes to 0.
2310   // * whenever pending_purge_obsolete_files_ goes to 0.
2311   // * whenever disable_delete_obsolete_files_ goes to 0.
2312   // * whenever SetOptions successfully updates options.
2313   // * whenever a column family is dropped.
2314   InstrumentedCondVar bg_cv_;
2315   // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
2316   // must be under either mutex_ or log_write_mutex_. Since after ::Open,
2317   // logfile_number_ is currently updated only in write_thread_, it can be read
2318   // from the same write_thread_ without any locks.
2319   uint64_t logfile_number_;
2320   // Log files that we can recycle. Must be protected by db mutex_.
2321   std::deque<uint64_t> log_recycle_files_;
2322   // Protected by log_write_mutex_.
2323   bool log_dir_synced_;
2324   // Without two_write_queues, read and writes to log_empty_ are protected by
2325   // mutex_. Since it is currently updated/read only in write_thread_, it can be
2326   // accessed from the same write_thread_ without any locks. With
2327   // two_write_queues writes, where it can be updated in different threads,
2328   // read and writes are protected by log_write_mutex_ instead. This is to avoid
2329   // expensive mutex_ lock during WAL write, which update log_empty_.
2330   bool log_empty_;
2331
2332   ColumnFamilyHandleImpl* persist_stats_cf_handle_;
2333
2334   bool persistent_stats_cfd_exists_ = true;
2335
2336   // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
2337   // as follows:
2338   // 1. read by FindObsoleteFiles() which can be called in either application
2339   //    thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
2340   //    held.
2341   // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
2342   //    are held.
2343   // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
2344   //    (actually called by Open()), only mutex_ is held because at this point,
2345   //    the DB::Open() call has not returned success to application, and the
2346   //    only other thread(s) that can conflict are bg threads calling
2347   //    FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
2348   //    are held when accessing alive_log_files_.
2349   // 4. read by DBImpl::Open() is protected by mutex_.
2350   // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
2351   //    held. This is done by the write group leader. Note that in the case of
2352   //    two-write-queues, another WAL-only write thread can be writing to the
2353   //    WAL concurrently. See 9.
2354   // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
2355   //    done by write group leader.
2356   // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
2357   //    two-write-queues. Only log_write_mutex_ is held to protect concurrent
2358   //    pop_front() by FindObsoleteFiles().
2359   // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
2360   //    is held to protect the data structure from concurrent pop_front() by
2361   //    FindObsoleteFiles().
2362   // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
2363   //    of two-write-queues. Only log_write_mutex_ is held. This suffices to
2364   //    protect the data structure from concurrent push_back() by current
2365   //    write group leader as well as pop_front() by FindObsoleteFiles().
2366   std::deque<LogFileNumberSize> alive_log_files_;
2367
2368   // Log files that aren't fully synced, and the current log file.
2369   // Synchronization:
2370   // 1. read by FindObsoleteFiles() which can be called either in application
2371   //    thread or RocksDB bg threads. log_write_mutex_ is always held, while
2372   //    some reads are performed without mutex_.
2373   // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
2374   // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
2375   // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex_.
2376   //    Note that at this point, DB::Open() has not returned success to
2377   //    application, thus the only other thread(s) that can conflict are bg
2378   //    threads calling FindObsoleteFiles(). See 1.
2379   // 5. iteration and clear() from CloseHelper() always hold log_write_mutex_
2380   //    and mutex_.
2381   // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
2382   //    log_write_mutex_. These two can be called by application threads after
2383   //    DB::Open() returns success to applications.
2384   // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
2385   // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
2386   //    log_write_mutex_.
2387   // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
2388   // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
2389   //     happen in bg flush threads after DB::Open() returns success to
2390   //     applications.
2391   // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
2392   //     holds only the log_write_mutex_. This is done by the write group
2393   //     leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
2394   //     can happen concurrently. This is fine because log_write_mutex_ is used
2395   //     by all parties. See 2, 5, 9.
2396   // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
2397   //     log_write_mutex_. This happens in the write group leader.
2398   // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
2399   //     log_write_mutex_. This happens in the write group leader. Can conflict
2400   //     with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
2401   //     SyncClosedLogs(), etc. as well as application threads calling
2402   //     FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
2403   //     require at least log_write_mutex_.
2404   // 14. iteration called in WriteToWAL(write_group) protected by
2405   //     log_write_mutex_. This is done by write group leader when
2406   //     two-write-queues is disabled and write needs to sync logs.
2407   // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
2408   //     This can be done by the write group leader if two-write-queues is
2409   //     enabled. It can also be done by another WAL-only write thread.
2410   //
2411   // Other observations:
2412   //  - back() and items with getting_synced=true are not popped,
2413   //  - The same thread that sets getting_synced=true will reset it.
2414   //  - it follows that the object referred by back() can be safely read from
2415   //  the write_thread_ without using mutex. Note that calling back() without
2416   //  mutex may be unsafe because different implementations of deque::back() may
2417   //  access other member variables of deque, causing undefined behaviors.
2418   //  Generally, do not access stl containers without proper synchronization.
2419   //  - it follows that the items with getting_synced=true can be safely read
2420   //  from the same thread that has set getting_synced=true
2421   std::deque<LogWriterNumber> logs_;
2422
2423   // Signaled when getting_synced becomes false for some of the logs_.
2424   InstrumentedCondVar log_sync_cv_;
2425   // This is the app-level state that is written to the WAL but will be used
2426   // only during recovery. Using this feature enables not writing the state to
2427   // memtable on normal writes and hence improving the throughput. Each new
2428   // write of the state will replace the previous state entirely even if the
2429   // keys in the two consecutive states do not overlap.
2430   // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
2431   // Otherwise only the head of write_thread_ can access it.
2432   WriteBatch cached_recoverable_state_;
2433   std::atomic<bool> cached_recoverable_state_empty_ = {true};
2434   std::atomic<uint64_t> total_log_size_;
2435
2436   // If this is non-empty, we need to delete these log files in background
2437   // threads. Protected by log_write_mutex_.
2438   autovector<log::Writer*> logs_to_free_;
2439
2440   bool is_snapshot_supported_;
2441
2442   std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
2443
2444   std::map<std::string, uint64_t> stats_slice_;
2445
2446   bool stats_slice_initialized_ = false;
2447
2448   Directories directories_;
2449
2450   WriteBufferManager* write_buffer_manager_;
2451
2452   WriteThread write_thread_;
2453   WriteBatch tmp_batch_;
2454   // The write thread when the writers have no memtable write. This will be used
2455   // in 2PC to batch the prepares separately from the serial commit.
2456   WriteThread nonmem_write_thread_;
2457
2458   WriteController write_controller_;
2459
2460   // Size of the last batch group. In slowdown mode, next write needs to
2461   // sleep if it uses up the quota.
2462   // Note: This is to protect memtable and compaction. If the batch only writes
2463   // to the WAL, its size need not be included in this.
2464   uint64_t last_batch_group_size_;
2465
2466   FlushScheduler flush_scheduler_;
2467
2468   TrimHistoryScheduler trim_history_scheduler_;
2469
2470   SnapshotList snapshots_;
2471
2472   TimestampedSnapshotList timestamped_snapshots_;
2473
2474   // For each background job, pending_outputs_ keeps the current file number at
2475   // the time that background job started.
2476   // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
2477   // number bigger than any of the file number in pending_outputs_. Since file
2478   // numbers grow monotonically, this also means that pending_outputs_ is always
2479   // sorted. After a background job is done executing, its file number is
2480   // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
2481   // it up.
2482   // State is protected with db mutex.
2483   std::list<uint64_t> pending_outputs_;
2484
2485   // flush_queue_ and compaction_queue_ hold column families that we need to
2486   // flush and compact, respectively.
2487   // A column family is inserted into flush_queue_ when it satisfies condition
2488   // cfd->imm()->IsFlushPending()
2489   // A column family is inserted into compaction_queue_ when it satisfies
2490   // condition cfd->NeedsCompaction()
2491   // Column families in this list are all Ref()-erenced
2492   // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
2493   // do RAII on ColumnFamilyData
2494   // Column families are in this queue when they need to be flushed or
2495   // compacted. Consumers of these queues are flush and compaction threads. When
2496   // column family is put on this queue, we increase unscheduled_flushes_ and
2497   // unscheduled_compactions_. When these variables are bigger than zero, that
2498   // means we need to schedule background threads for flush and compaction.
2499   // Once the background threads are scheduled, we decrease unscheduled_flushes_
2500   // and unscheduled_compactions_. That way we keep track of number of
2501   // compaction and flush threads we need to schedule. This scheduling is done
2502   // in MaybeScheduleFlushOrCompaction()
2503   // invariant(column family present in flush_queue_ <==>
2504   // ColumnFamilyData::pending_flush_ == true)
2505   std::deque<FlushRequest> flush_queue_;
2506   // invariant(column family present in compaction_queue_ <==>
2507   // ColumnFamilyData::pending_compaction_ == true)
2508   std::deque<ColumnFamilyData*> compaction_queue_;
2509
2510   // A map to store file numbers and filenames of the files to be purged
2511   std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
2512
2513   // A set to store the file numbers that have been assigned to certain
2514   // JobContext. Current implementation tracks table and blob files only.
2515   std::unordered_set<uint64_t> files_grabbed_for_purge_;
2516
2517   // A queue to store log writers to close. Protected by db mutex_.
2518   std::deque<log::Writer*> logs_to_free_queue_;
2519
2520   std::deque<SuperVersion*> superversions_to_free_queue_;
2521
2522   int unscheduled_flushes_;
2523
2524   int unscheduled_compactions_;
2525
2526   // count how many background compactions are running or have been scheduled in
2527   // the BOTTOM pool
2528   int bg_bottom_compaction_scheduled_;
2529
2530   // count how many background compactions are running or have been scheduled
2531   int bg_compaction_scheduled_;
2532
2533   // stores the number of compactions that are currently running
2534   int num_running_compactions_;
2535
2536   // number of background memtable flush jobs, submitted to the HIGH pool
2537   int bg_flush_scheduled_;
2538
2539   // stores the number of flushes that are currently running
2540   int num_running_flushes_;
2541
2542   // number of background obsolete file purge jobs, submitted to the HIGH pool
2543   int bg_purge_scheduled_;
2544
2545   std::deque<ManualCompactionState*> manual_compaction_dequeue_;
2546
2547   // shall we disable deletion of obsolete files
2548   // if 0 the deletion is enabled.
2549   // if non-zero, files will not be getting deleted
2550   // This enables two different threads to call
2551   // EnableFileDeletions() and DisableFileDeletions()
2552   // without any synchronization
2553   int disable_delete_obsolete_files_;
2554
2555   // Number of times FindObsoleteFiles has found deletable files and the
2556   // corresponding call to PurgeObsoleteFiles has not yet finished.
2557   int pending_purge_obsolete_files_;
2558
2559   // last time when DeleteObsoleteFiles with full scan was executed. Originally
2560   // initialized with startup time.
2561   uint64_t delete_obsolete_files_last_run_;
2562
2563   // last time stats were dumped to LOG
2564   std::atomic<uint64_t> last_stats_dump_time_microsec_;
2565
2566   // The thread that wants to switch memtable, can wait on this cv until the
2567   // pending writes to memtable finishes.
2568   std::condition_variable switch_cv_;
2569   // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
2570   std::mutex switch_mutex_;
2571   // Number of threads intending to write to memtable
2572   std::atomic<size_t> pending_memtable_writes_ = {};
2573
2574   // A flag indicating whether the current rocksdb database has any
2575   // data that is not yet persisted into either WAL or SST file.
2576   // Used when disableWAL is true.
2577   std::atomic<bool> has_unpersisted_data_;
2578
2579   // if an attempt was made to flush all column families that
2580   // the oldest log depends on but uncommitted data in the oldest
2581   // log prevents the log from being released.
2582   // We must attempt to free the dependent memtables again
2583   // at a later time after the transaction in the oldest
2584   // log is fully committed.
2585   bool unable_to_release_oldest_log_;
2586
2587   // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
2588   // calls.
2589   // REQUIRES: mutex held
2590   int num_running_ingest_file_;
2591
2592 #ifndef ROCKSDB_LITE
2593   WalManager wal_manager_;
2594 #endif  // ROCKSDB_LITE
2595
2596   // A value of > 0 temporarily disables scheduling of background work
2597   int bg_work_paused_;
2598
2599   // A value of > 0 temporarily disables scheduling of background compaction
2600   int bg_compaction_paused_;
2601
2602   // Guard against multiple concurrent refitting
2603   bool refitting_level_;
2604
2605   // Indicate DB was opened successfully
2606   bool opened_successfully_;
2607
2608   // The min threshold to trigger bottommost compaction for removing
2609   // garbage, among all column families.
2610   SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
2611
2612   LogsWithPrepTracker logs_with_prep_tracker_;
2613
2614   // Callback for compaction to check if a key is visible to a snapshot.
2615   // REQUIRES: mutex held
2616   std::unique_ptr<SnapshotChecker> snapshot_checker_;
2617
2618   // Callback for when the cached_recoverable_state_ is written to memtable
2619   // Only to be set during initialization
2620   std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
2621
2622 #ifndef ROCKSDB_LITE
2623   // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
2624   // Currently, internally it has a global timer instance for running the tasks.
2625   PeriodicTaskScheduler periodic_task_scheduler_;
2626
2627   // It contains the implementations for each periodic task.
2628   std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
2629 #endif
2630
2631   // When set, we use a separate queue for writes that don't write to memtable.
2632   // In 2PC these are the writes at Prepare phase.
2633   const bool two_write_queues_;
2634   const bool manual_wal_flush_;
2635
2636   // LastSequence also indicates last published sequence visible to the
2637   // readers. Otherwise LastPublishedSequence should be used.
2638   const bool last_seq_same_as_publish_seq_;
2639   // It indicates that a customized gc algorithm must be used for
2640   // flush/compaction and if it is not provided via SnapshotChecker, we should
2641   // disable gc to be safe.
2642   const bool use_custom_gc_;
2643   // Flag to indicate that the DB instance shutdown has been initiated. This is
2644   // different from shutting_down_ atomic in that it is set at the beginning
2645   // of shutdown sequence, specifically in order to prevent any background
2646   // error recovery from going on in parallel. The latter, shutting_down_,
2647   // is set a little later during the shutdown after scheduling memtable
2648   // flushes
2649   std::atomic<bool> shutdown_initiated_;
2650   // Flag to indicate whether sst_file_manager object was allocated in
2651   // DB::Open() or passed to us
2652   bool own_sfm_;
2653
2654   // Flag to check whether Close() has been called on this DB
2655   bool closed_;
2656   // save the closing status, for re-calling the close()
2657   Status closing_status_;
2658   // mutex for DB::Close()
2659   InstrumentedMutex closing_mutex_;
2660
2661   // Condition variable to coordinate installation of atomic flush results.
2662   // With atomic flush, each bg thread installs the result of flushing multiple
2663   // column families, and different threads can flush different column
2664   // families. It's difficult to rely on one thread to perform batch
2665   // installation for all threads. This is different from the non-atomic flush
2666   // case.
2667   // atomic_flush_install_cv_ makes sure that threads install atomic flush
2668   // results sequentially. Flush results of memtables with lower IDs get
2669   // installed to MANIFEST first.
2670   InstrumentedCondVar atomic_flush_install_cv_;
2671
2672   bool wal_in_db_path_;
2673   std::atomic<uint64_t> max_total_wal_size_;
2674
2675   BlobFileCompletionCallback blob_callback_;
2676
2677   // Pointer to WriteBufferManager stalling interface.
2678   std::unique_ptr<StallInterface> wbm_stall_;
2679
2680   // seqno_time_mapping_ stores the sequence number to time mapping. It is not
2681   // thread safe: both reads and writes must hold the DB mutex.
2682   SeqnoToTimeMapping seqno_time_mapping_;
2683 };
2684
2685 class GetWithTimestampReadCallback : public ReadCallback {
2686  public:
2687   explicit GetWithTimestampReadCallback(SequenceNumber seq)
2688       : ReadCallback(seq) {}
2689   bool IsVisibleFullCheck(SequenceNumber seq) override {
2690     return seq <= max_visible_seq_;
2691   }
2692 };
2693
2694 extern Options SanitizeOptions(const std::string& db, const Options& src,
2695                                bool read_only = false,
2696                                Status* logger_creation_s = nullptr);
2697
2698 extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
2699                                  bool read_only = false,
2700                                  Status* logger_creation_s = nullptr);
2701
2702 extern CompressionType GetCompressionFlush(
2703     const ImmutableCFOptions& ioptions,
2704     const MutableCFOptions& mutable_cf_options);
2705
2706 // Return the earliest log file to keep after the memtable flush is
2707 // finalized.
2708 // `cfd_to_flush` is the column family whose memtable (specified in
2709 // `memtables_to_flush`) will be flushed and thus will not depend on any WAL
2710 // file.
2711 // The function is only applicable to 2pc mode.
2712 extern uint64_t PrecomputeMinLogNumberToKeep2PC(
2713     VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
2714     const autovector<VersionEdit*>& edit_list,
2715     const autovector<MemTable*>& memtables_to_flush,
2716     LogsWithPrepTracker* prep_tracker);
2717 // For atomic flush.
2718 extern uint64_t PrecomputeMinLogNumberToKeep2PC(
2719     VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
2720     const autovector<autovector<VersionEdit*>>& edit_lists,
2721     const autovector<const autovector<MemTable*>*>& memtables_to_flush,
2722     LogsWithPrepTracker* prep_tracker);
2723
2724 // In non-2PC mode, WALs with log number < the returned number can be
2725 // deleted after the cfd_to_flush column family is flushed successfully.
2726 extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
2727     VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
2728     const autovector<VersionEdit*>& edit_list);
2729 // For atomic flush.
2730 extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
2731     VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
2732     const autovector<autovector<VersionEdit*>>& edit_lists);
2733
2734 // `cfd_to_flush` is the column family whose memtable will be flushed and thus
2735 // will not depend on any WAL file. nullptr means no memtable is being flushed.
2736 // The function is only applicable to 2pc mode.
2737 extern uint64_t FindMinPrepLogReferencedByMemTable(
2738     VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
2739 // For atomic flush.
2740 extern uint64_t FindMinPrepLogReferencedByMemTable(
2741     VersionSet* vset,
2742     const autovector<const autovector<MemTable*>*>& memtables_to_flush);
2743
// Clamps `*ptr` into [minvalue, maxvalue]; used to pull user-supplied option
// values back into a reasonable range.
// Comparisons are carried out in type V (the stored value is cast to V), and
// the chosen bound is converted back to T on assignment. The upper bound is
// applied before the lower bound, so if minvalue > maxvalue the stored value
// always ends up as minvalue.
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (maxvalue < static_cast<V>(*ptr)) {
    *ptr = maxvalue;
  }
  if (minvalue > static_cast<V>(*ptr)) {
    *ptr = minvalue;
  }
}
2750
2751 inline Status DBImpl::FailIfCfHasTs(
2752     const ColumnFamilyHandle* column_family) const {
2753   column_family = column_family ? column_family : DefaultColumnFamily();
2754   assert(column_family);
2755   const Comparator* const ucmp = column_family->GetComparator();
2756   assert(ucmp);
2757   if (ucmp->timestamp_size() > 0) {
2758     std::ostringstream oss;
2759     oss << "cannot call this method on column family "
2760         << column_family->GetName() << " that enables timestamp";
2761     return Status::InvalidArgument(oss.str());
2762   }
2763   return Status::OK();
2764 }
2765
2766 inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
2767                                          const Slice& ts,
2768                                          bool ts_for_read) const {
2769   if (!column_family) {
2770     return Status::InvalidArgument("column family handle cannot be null");
2771   }
2772   assert(column_family);
2773   const Comparator* const ucmp = column_family->GetComparator();
2774   assert(ucmp);
2775   if (0 == ucmp->timestamp_size()) {
2776     std::stringstream oss;
2777     oss << "cannot call this method on column family "
2778         << column_family->GetName() << " that does not enable timestamp";
2779     return Status::InvalidArgument(oss.str());
2780   }
2781   const size_t ts_sz = ts.size();
2782   if (ts_sz != ucmp->timestamp_size()) {
2783     std::stringstream oss;
2784     oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
2785         << ts_sz << " given";
2786     return Status::InvalidArgument(oss.str());
2787   }
2788   if (ts_for_read) {
2789     auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
2790     auto cfd = cfh->cfd();
2791     std::string current_ts_low = cfd->GetFullHistoryTsLow();
2792     if (!current_ts_low.empty() &&
2793         ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
2794       std::stringstream oss;
2795       oss << "Read timestamp: " << ts.ToString(true)
2796           << " is smaller than full_history_ts_low: "
2797           << Slice(current_ts_low).ToString(true) << std::endl;
2798       return Status::InvalidArgument(oss.str());
2799     }
2800   }
2801   return Status::OK();
2802 }
2803
2804 }  // namespace ROCKSDB_NAMESPACE