ceph/src/rocksdb/include/rocksdb/listener.h

   1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
   4 //
   5 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
   6
   7 #pragma once
   8
#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
  14
  15 #include "rocksdb/compaction_job_stats.h"
  16 #include "rocksdb/compression_type.h"
  17 #include "rocksdb/status.h"
  18 #include "rocksdb/table_properties.h"
  19
  20 namespace ROCKSDB_NAMESPACE {
  21
  22 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
  23     TablePropertiesCollection;
  24
// Forward declarations of types named in this header.
// DB and ColumnFamilyHandle are only used through pointers in the listener
// callbacks below.  Status and CompactionJobStats are also stored by value in
// the info structs, so their complete definitions are required as well
// (presumably provided by "rocksdb/status.h" and
// "rocksdb/compaction_job_stats.h", which are included above).
class DB;
class ColumnFamilyHandle;
class Status;
struct CompactionJobStats;
  29
  30 enum class TableFileCreationReason {
  31   kFlush,
  32   kCompaction,
  33   kRecovery,
  34   kMisc,
  35 };
  36
// Identifying information about a table file that is being created.
// This is the argument type of EventListener::OnTableFileCreationStarted()
// and the base class of TableFileCreationInfo.
//
// The struct has no constructors or default member initializers, so `job_id`
// and `reason` hold indeterminate values until the producer assigns them
// (unless the object is value-initialized).
struct TableFileCreationBriefInfo {
  // The name of the database where the file was created.
  std::string db_name;
  // The name of the column family where the file was created.
  std::string cf_name;
  // The path to the created file.
  std::string file_path;
  // The id of the job (which could be a flush or a compaction) that
  // created the file.
  int job_id;
  // The reason the table file was created.
  TableFileCreationReason reason;
};
  50
  51 struct TableFileCreationInfo : public TableFileCreationBriefInfo {
  52   TableFileCreationInfo() = default;
  53   explicit TableFileCreationInfo(TableProperties&& prop)
  54       : table_properties(prop) {}
  55   // the size of the file.
  56   uint64_t file_size;
  57   // Detailed properties of the created file.
  58   TableProperties table_properties;
  59   // The status indicating whether the creation was successful or not.
  60   Status status;
  61   // The checksum of the table file being created
  62   std::string file_checksum;
  63   // The checksum function name of checksum generator used for this table file
  64   std::string file_checksum_func_name;
  65 };
  66
  67 enum class CompactionReason : int {
  68   kUnknown = 0,
  69   // [Level] number of L0 files > level0_file_num_compaction_trigger
  70   kLevelL0FilesNum,
  71   // [Level] total size of level > MaxBytesForLevel()
  72   kLevelMaxLevelSize,
  73   // [Universal] Compacting for size amplification
  74   kUniversalSizeAmplification,
  75   // [Universal] Compacting for size ratio
  76   kUniversalSizeRatio,
  77   // [Universal] number of sorted runs > level0_file_num_compaction_trigger
  78   kUniversalSortedRunNum,
  79   // [FIFO] total size > max_table_files_size
  80   kFIFOMaxSize,
  81   // [FIFO] reduce number of files.
  82   kFIFOReduceNumFiles,
  83   // [FIFO] files with creation time < (current_time - interval)
  84   kFIFOTtl,
  85   // Manual compaction
  86   kManualCompaction,
  87   // DB::SuggestCompactRange() marked files for compaction
  88   kFilesMarkedForCompaction,
  89   // [Level] Automatic compaction within bottommost level to cleanup duplicate
  90   // versions of same user key, usually due to a released snapshot.
  91   kBottommostFiles,
  92   // Compaction based on TTL
  93   kTtl,
  94   // According to the comments in flush_job.cc, RocksDB treats flush as
  95   // a level 0 compaction in internal stats.
  96   kFlush,
  97   // Compaction caused by external sst file ingestion
  98   kExternalSstIngestion,
  99   // Compaction due to SST file being too old
 100   kPeriodicCompaction,
 101   // total number of compaction reasons, new reasons must be added above this.
 102   kNumOfReasons,
 103 };
 104
 105 enum class FlushReason : int {
 106   kOthers = 0x00,
 107   kGetLiveFiles = 0x01,
 108   kShutDown = 0x02,
 109   kExternalFileIngestion = 0x03,
 110   kManualCompaction = 0x04,
 111   kWriteBufferManager = 0x05,
 112   kWriteBufferFull = 0x06,
 113   kTest = 0x07,
 114   kDeleteFiles = 0x08,
 115   kAutoCompaction = 0x09,
 116   kManualFlush = 0x0a,
 117   kErrorRecovery = 0xb,
 118   // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable
 119   // will not be called to avoid many small immutable memtables.
 120   kErrorRecoveryRetryFlush = 0xc,
 121 };
 122
 123 enum class BackgroundErrorReason {
 124   kFlush,
 125   kCompaction,
 126   kWriteCallback,
 127   kMemTable,
 128   kManifestWrite,
 129   kFlushNoWAL,
 130 };
 131
 132 enum class WriteStallCondition {
 133   kNormal,
 134   kDelayed,
 135   kStopped,
 136 };
 137
// Describes the write-stall state of one column family.  This is the
// argument type of EventListener::OnStallConditionsChanged().
struct WriteStallInfo {
  // The name of the column family.
  std::string cf_name;
  // State of the write controller.
  struct {
    // Presumably the condition now in effect (name suggests "current").
    WriteStallCondition cur;
    // Presumably the condition that was in effect before the change.
    WriteStallCondition prev;
  } condition;
};
 147
 148 #ifndef ROCKSDB_LITE
 149
// Describes the deletion of a table file.  This is the argument type of
// EventListener::OnTableFileDeleted().
//
// `job_id` has no default initializer; it is indeterminate until assigned
// unless the object is value-initialized.
struct TableFileDeletionInfo {
  // The name of the database where the file was deleted.
  std::string db_name;
  // The path to the deleted file.
  std::string file_path;
  // The id of the job which deleted the file.
  int job_id;
  // The status indicating whether the deletion was successful or not.
  Status status;
};
 160
 161 enum class FileOperationType {
 162   kRead,
 163   kWrite,
 164   kTruncate,
 165   kClose,
 166   kFlush,
 167   kSync,
 168   kFsync,
 169   kRangeSync
 170 };
 171
 172 struct FileOperationInfo {
 173   using Duration = std::chrono::nanoseconds;
 174   using SteadyTimePoint =
 175       std::chrono::time_point<std::chrono::steady_clock, Duration>;
 176   using SystemTimePoint =
 177       std::chrono::time_point<std::chrono::system_clock, Duration>;
 178   using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
 179   using FinishTimePoint = SteadyTimePoint;
 180
 181   FileOperationType type;
 182   const std::string& path;
 183   uint64_t offset;
 184   size_t length;
 185   const Duration duration;
 186   const SystemTimePoint& start_ts;
 187   Status status;
 188   FileOperationInfo(const FileOperationType _type, const std::string& _path,
 189                     const StartTimePoint& _start_ts,
 190                     const FinishTimePoint& _finish_ts, const Status& _status)
 191       : type(_type),
 192         path(_path),
 193         duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
 194             _finish_ts - _start_ts.second)),
 195         start_ts(_start_ts.first),
 196         status(_status) {}
 197   static StartTimePoint StartNow() {
 198     return std::make_pair<SystemTimePoint, SteadyTimePoint>(
 199         std::chrono::system_clock::now(), std::chrono::steady_clock::now());
 200   }
 201   static FinishTimePoint FinishNow() {
 202     return std::chrono::steady_clock::now();
 203   }
 204 };
 205
// Describes a flush job.  This is the argument type of
// EventListener::OnFlushBegin() and EventListener::OnFlushCompleted().
//
// The struct has no constructors or default member initializers, so the
// scalar members are indeterminate until the producer assigns them (unless
// the object is value-initialized).
struct FlushJobInfo {
  // The id of the column family.
  uint32_t cf_id;
  // The name of the column family.
  std::string cf_name;
  // The path to the newly created file.
  std::string file_path;
  // The file number of the newly created file.
  uint64_t file_number;
  // The oldest blob file referenced by the newly created file.
  uint64_t oldest_blob_file_number;
  // The id of the thread that completed this flush job.
  uint64_t thread_id;
  // The job id, which is unique in the same thread.
  int job_id;
  // If true, then rocksdb is currently slowing down all writes to prevent
  // creating too many Level 0 files, as compaction seems unable to
  // keep up with the write request speed.  This indicates that there are
  // too many files in Level 0.
  bool triggered_writes_slowdown;
  // If true, then rocksdb is currently blocking any writes to prevent
  // creating more L0 files.  This indicates that there are too many
  // files in level 0.  Compactions should try to compact L0 files down
  // to lower levels as soon as possible.
  bool triggered_writes_stop;
  // The smallest sequence number in the newly created file.
  SequenceNumber smallest_seqno;
  // The largest sequence number in the newly created file.
  SequenceNumber largest_seqno;
  // Table properties of the table being flushed.
  TableProperties table_properties;

  // The reason the flush was triggered.
  FlushReason flush_reason;
};
 240
// Per-file details of a compaction input or output file.  Stored in
// CompactionJobInfo::input_file_infos and
// CompactionJobInfo::output_file_infos.
struct CompactionFileInfo {
  // The level of the file.
  int level;

  // The file number of the file.
  uint64_t file_number;

  // The file number of the oldest blob file this SST file references.
  uint64_t oldest_blob_file_number;
};
 251
// Describes a compaction job.  This is the argument type of
// EventListener::OnCompactionBegin() and
// EventListener::OnCompactionCompleted().
//
// NOTE(review): because a destructor is declared, the compiler does not
// generate implicit move operations for this struct; "moving" a
// CompactionJobInfo therefore copies its strings, vectors and map.
struct CompactionJobInfo {
  // Calls PermitUncheckedError() on `status` before the members are
  // destroyed -- presumably so that a status nobody inspected does not trip
  // an unchecked-status assertion; confirm against rocksdb/status.h.
  ~CompactionJobInfo() { status.PermitUncheckedError(); }
  // The id of the column family where the compaction happened.
  uint32_t cf_id;
  // The name of the column family where the compaction happened.
  std::string cf_name;
  // The status indicating whether the compaction was successful or not.
  Status status;
  // The id of the thread that completed this compaction job.
  uint64_t thread_id;
  // The job id, which is unique in the same thread.
  int job_id;
  // The smallest input level of the compaction.
  int base_input_level;
  // The output level of the compaction.
  int output_level;

  // The following variables contain information about compaction inputs
  // and outputs. A file may appear in both the input and output lists
  // if it was simply moved to a different level. The order of elements
  // is the same across input_files and input_file_infos; similarly, it is
  // the same across output_files and output_file_infos.

  // The names of the compaction input files.
  std::vector<std::string> input_files;

  // Additional information about the compaction input files.
  std::vector<CompactionFileInfo> input_file_infos;

  // The names of the compaction output files.
  std::vector<std::string> output_files;

  // Additional information about the compaction output files.
  std::vector<CompactionFileInfo> output_file_infos;

  // Table properties for input and output tables.
  // The map is keyed by values from input_files and output_files.
  TablePropertiesCollection table_properties;

  // The reason the compaction was run.
  CompactionReason compaction_reason;

  // The compression algorithm used for output files.
  CompressionType compression;

  // Statistics and other additional details on the compaction.
  CompactionJobStats stats;
};
 300
// Describes a memtable.  This is the argument type of
// EventListener::OnMemTableSealed().
struct MemTableInfo {
  // The name of the column family to which the memtable belongs.
  std::string cf_name;
  // Sequence number of the first element that was inserted
  // into the memtable.
  SequenceNumber first_seqno;
  // Sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  SequenceNumber earliest_seqno;
  // Total number of entries in the memtable.
  uint64_t num_entries;
  // Total number of deletes in the memtable.
  uint64_t num_deletes;
};
 317
// Describes an ingested external file.  This is the argument type of
// EventListener::OnExternalFileIngested().
struct ExternalFileIngestionInfo {
  // The name of the column family.
  std::string cf_name;
  // Path of the file outside the DB.
  std::string external_file_path;
  // Path of the file inside the DB.
  std::string internal_file_path;
  // The global sequence number assigned to keys in this file.
  SequenceNumber global_seqno;
  // Table properties of the table.  (The previous wording, "being flushed",
  // looks copied from FlushJobInfo; presumably these are the properties of
  // the ingested file.)
  TableProperties table_properties;
};
 330
// EventListener class contains a set of callback functions that will
// be called when specific RocksDB events happen, such as a flush.  It can
// be used as a building block for developing custom features such as
// a stats-collector or an external compaction algorithm.
//
// Note that callback functions should not run for an extended period of
// time before returning, otherwise RocksDB may be blocked.
// For example, it is not suggested to do DB::CompactFiles() (as it may
// run for a long while) or issue many DB::Put() calls (as Put may be blocked
// in certain cases) in the same thread in the EventListener callback.
// However, doing DB::CompactFiles() and DB::Put() in another thread is
// considered safe.
//
// [Threading] All EventListener callbacks will be called on the
// actual thread that is involved in that specific event.  For example, it
// is the RocksDB background flush thread that does the actual flush that
// calls EventListener::OnFlushCompleted().
//
// [Locking] All EventListener callbacks are designed to be called without
// the current thread holding any DB mutex. This is to prevent potential
// deadlocks and performance issues when using EventListener callbacks
// in a complex way.
//
// Every callback has an empty default implementation, so a subclass only
// needs to override the events it is interested in.
class EventListener {
 public:
  // A callback function to RocksDB which will be called whenever a
  // registered RocksDB flushes a file.  The default implementation is
  // a no-op.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  virtual void OnFlushCompleted(DB* /*db*/,
                                const FlushJobInfo& /*flush_job_info*/) {}

  // A callback function to RocksDB which will be called before a
  // RocksDB starts to flush memtables.  The default implementation is
  // a no-op.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  virtual void OnFlushBegin(DB* /*db*/,
                            const FlushJobInfo& /*flush_job_info*/) {}

  // A callback function for RocksDB which will be called whenever
  // an SST file is deleted.  Different from OnCompactionCompleted and
  // OnFlushCompleted, this callback is designed for external logging
  // services and thus only provides string parameters instead
  // of a pointer to DB.  Applications that build logic based
  // on file creations and deletions are advised to implement
  // OnFlushCompleted and OnCompactionCompleted.
  //
  // Note that if applications would like to use the passed reference
  // outside this function call, they should make copies of the
  // passed value.
  virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}

  // A callback function to RocksDB which will be called before a
  // RocksDB starts to compact.  The default implementation is
  // a no-op.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}

  // A callback function for RocksDB which will be called whenever
  // a registered RocksDB compacts a file. The default implementation
  // is a no-op.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns. Otherwise, RocksDB may be blocked.
  //
  // @param db a pointer to the rocksdb instance which just compacted
  //   a file.
  // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
  //  after this function returns, and must be copied if it is needed
  //  outside of this function.
  virtual void OnCompactionCompleted(DB* /*db*/,
                                     const CompactionJobInfo& /*ci*/) {}

  // A callback function for RocksDB which will be called whenever
  // an SST file is created.  Different from OnCompactionCompleted and
  // OnFlushCompleted, this callback is designed for external logging
  // services and thus only provides string parameters instead
  // of a pointer to DB.  Applications that build logic based
  // on file creations and deletions are advised to implement
  // OnFlushCompleted and OnCompactionCompleted.
  //
  // Historically it was only called if the file was successfully created.
  // Now it is also called in the failure case. Users can check info.status
  // to see whether it succeeded or not.
  //
  // Note that if applications would like to use the passed reference
  // outside this function call, they should make copies of the
  // passed value.
  virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}

  // A callback function for RocksDB which will be called before
  // an SST file is created. It will be followed by OnTableFileCreated after
  // the creation finishes.
  //
  // Note that if applications would like to use the passed reference
  // outside this function call, they should make copies of the
  // passed value.
  virtual void OnTableFileCreationStarted(
      const TableFileCreationBriefInfo& /*info*/) {}

  // A callback function for RocksDB which will be called before
  // a memtable is made immutable.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  //
  // Note that if applications would like to use the passed reference
  // outside this function call, they should make copies of the
  // passed value.
  virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}

  // A callback function for RocksDB which will be called before
  // a column family handle is deleted.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  // @param handle is a pointer to the column family handle to be deleted
  // which will become a dangling pointer after the deletion.
  virtual void OnColumnFamilyHandleDeletionStarted(
      ColumnFamilyHandle* /*handle*/) {}

  // A callback function for RocksDB which will be called after an external
  // file is ingested using IngestExternalFile.
  //
  // Note that this function will run on the same thread as
  // IngestExternalFile(); if this function is blocked, IngestExternalFile()
  // will be blocked from finishing.
  virtual void OnExternalFileIngested(
      DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}

  // A callback function for RocksDB which will be called before setting the
  // background error status to a non-OK value. The new background error status
  // is provided in `bg_error` and can be modified by the callback. E.g., a
  // callback can suppress errors by resetting it to Status::OK(), thus
  // preventing the database from entering read-only mode. We do not provide any
  // guarantee when failed flushes/compactions will be rescheduled if the user
  // suppresses an error.
  //
  // Note that this function can run on the same threads as flush, compaction,
  // and user writes. So, it is extremely important not to perform heavy
  // computations or blocking calls in this function.
  virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
                                 Status* /* bg_error */) {}

  // A callback function for RocksDB which will be called whenever a change
  // of superversion triggers a change of the stall conditions.
  //
  // Note that this function must be implemented in a way such that
  // it does not run for an extended period of time before the function
  // returns.  Otherwise, RocksDB may be blocked.
  virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}

  // A callback function for RocksDB which will be called whenever a file read
  // operation finishes.
  virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file write
  // operation finishes.
  virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file flush
  // operation finishes.
  virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file sync
  // operation finishes.
  virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file
  // rangeSync operation finishes.
  virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file
  // truncate operation finishes.
  virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}

  // A callback function for RocksDB which will be called whenever a file close
  // operation finishes.
  virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}

  // If true, the OnFile*Finish functions will be called. If
  // false, then they won't be called.  The default implementation returns
  // false, so file I/O notifications are opt-in.
  virtual bool ShouldBeNotifiedOnFileIO() { return false; }

  // A callback function for RocksDB which will be called just before
  // starting the automatic recovery process for recoverable background
  // errors, such as NoSpace(). The callback can suppress the automatic
  // recovery by setting *auto_recovery to false. The database will then
  // have to be transitioned out of read-only mode by calling DB::Resume().
  virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
                                    Status /* bg_error */,
                                    bool* /* auto_recovery */) {}

  // A callback function for RocksDB which will be called once the database
  // is recovered from read-only mode after an error. When this is called, it
  // means normal writes to the database can be issued and the user can
  // initiate any further recovery actions needed.
  virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}

  // Virtual so that a listener can be destroyed through an EventListener
  // pointer.
  virtual ~EventListener() {}
};
 543
 544 #else
 545
// ROCKSDB_LITE build: the listener API above is compiled out and only these
// empty placeholder types are defined -- presumably so that code which merely
// names EventListener or FlushJobInfo still compiles.
class EventListener {};
struct FlushJobInfo {};
 548
 549 #endif  // ROCKSDB_LITE
 550
 551 }  // namespace ROCKSDB_NAMESPACE