ceph/src/rocksdb/include/rocksdb/listener.h

   1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
   4
   5 #pragma once
   6
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
  15
  16 namespace rocksdb {
  17
  18 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
  19     TablePropertiesCollection;
  20
  21 class DB;
  22 class ColumnFamilyHandle;
  23 class Status;
  24 struct CompactionJobStats;
  25 enum CompressionType : unsigned char;
  26
  27 enum class TableFileCreationReason {
  28   kFlush,
  29   kCompaction,
  30   kRecovery,
  31   kMisc,
  32 };
  33
  34 struct TableFileCreationBriefInfo {
  35   // the name of the database where the file was created
  36   std::string db_name;
  37   // the name of the column family where the file was created.
  38   std::string cf_name;
  39   // the path to the created file.
  40   std::string file_path;
  41   // the id of the job (which could be flush or compaction) that
  42   // created the file.
  43   int job_id;
  44   // reason of creating the table.
  45   TableFileCreationReason reason;
  46 };
  47
  48 struct TableFileCreationInfo : public TableFileCreationBriefInfo {
  49   TableFileCreationInfo() = default;
  50   explicit TableFileCreationInfo(TableProperties&& prop)
  51       : table_properties(prop) {}
  52   // the size of the file.
  53   uint64_t file_size;
  54   // Detailed properties of the created file.
  55   TableProperties table_properties;
  56   // The status indicating whether the creation was successful or not.
  57   Status status;
  58 };
  59
  60 enum class CompactionReason : int {
  61   kUnknown = 0,
  62   // [Level] number of L0 files > level0_file_num_compaction_trigger
  63   kLevelL0FilesNum,
  64   // [Level] total size of level > MaxBytesForLevel()
  65   kLevelMaxLevelSize,
  66   // [Universal] Compacting for size amplification
  67   kUniversalSizeAmplification,
  68   // [Universal] Compacting for size ratio
  69   kUniversalSizeRatio,
  70   // [Universal] number of sorted runs > level0_file_num_compaction_trigger
  71   kUniversalSortedRunNum,
  72   // [FIFO] total size > max_table_files_size
  73   kFIFOMaxSize,
  74   // [FIFO] reduce number of files.
  75   kFIFOReduceNumFiles,
  76   // [FIFO] files with creation time < (current_time - interval)
  77   kFIFOTtl,
  78   // Manual compaction
  79   kManualCompaction,
  80   // DB::SuggestCompactRange() marked files for compaction
  81   kFilesMarkedForCompaction,
  82   // [Level] Automatic compaction within bottommost level to cleanup duplicate
  83   // versions of same user key, usually due to a released snapshot.
  84   kBottommostFiles,
  85   // Compaction based on TTL
  86   kTtl,
  87   // According to the comments in flush_job.cc, RocksDB treats flush as
  88   // a level 0 compaction in internal stats.
  89   kFlush,
  90   // Compaction caused by external sst file ingestion
  91   kExternalSstIngestion,
  92   // total number of compaction reasons, new reasons must be added above this.
  93   kNumOfReasons,
  94 };
  95
  96 enum class FlushReason : int {
  97   kOthers = 0x00,
  98   kGetLiveFiles = 0x01,
  99   kShutDown = 0x02,
 100   kExternalFileIngestion = 0x03,
 101   kManualCompaction = 0x04,
 102   kWriteBufferManager = 0x05,
 103   kWriteBufferFull = 0x06,
 104   kTest = 0x07,
 105   kDeleteFiles = 0x08,
 106   kAutoCompaction = 0x09,
 107   kManualFlush = 0x0a,
 108   kErrorRecovery = 0xb,
 109 };
 110
 111 enum class BackgroundErrorReason {
 112   kFlush,
 113   kCompaction,
 114   kWriteCallback,
 115   kMemTable,
 116 };
 117
 118 enum class WriteStallCondition {
 119   kNormal,
 120   kDelayed,
 121   kStopped,
 122 };
 123
 124 struct WriteStallInfo {
 125   // the name of the column family
 126   std::string cf_name;
 127   // state of the write controller
 128   struct {
 129     WriteStallCondition cur;
 130     WriteStallCondition prev;
 131   } condition;
 132 };
 133
 134 #ifndef ROCKSDB_LITE
 135
 136 struct TableFileDeletionInfo {
 137   // The name of the database where the file was deleted.
 138   std::string db_name;
 139   // The path to the deleted file.
 140   std::string file_path;
 141   // The id of the job which deleted the file.
 142   int job_id;
 143   // The status indicating whether the deletion was successful or not.
 144   Status status;
 145 };
 146
 147 struct FileOperationInfo {
 148   using TimePoint = std::chrono::time_point<std::chrono::system_clock,
 149                                             std::chrono::nanoseconds>;
 150
 151   const std::string& path;
 152   uint64_t offset;
 153   size_t length;
 154   const TimePoint& start_timestamp;
 155   const TimePoint& finish_timestamp;
 156   Status status;
 157   FileOperationInfo(const std::string& _path, const TimePoint& start,
 158                     const TimePoint& finish)
 159       : path(_path), start_timestamp(start), finish_timestamp(finish) {}
 160 };
 161
 162 struct FlushJobInfo {
 163   // the id of the column family
 164   uint32_t cf_id;
 165   // the name of the column family
 166   std::string cf_name;
 167   // the path to the newly created file
 168   std::string file_path;
 169   // the id of the thread that completed this flush job.
 170   uint64_t thread_id;
 171   // the job id, which is unique in the same thread.
 172   int job_id;
 173   // If true, then rocksdb is currently slowing-down all writes to prevent
 174   // creating too many Level 0 files as compaction seems not able to
 175   // catch up the write request speed.  This indicates that there are
 176   // too many files in Level 0.
 177   bool triggered_writes_slowdown;
 178   // If true, then rocksdb is currently blocking any writes to prevent
 179   // creating more L0 files.  This indicates that there are too many
 180   // files in level 0.  Compactions should try to compact L0 files down
 181   // to lower levels as soon as possible.
 182   bool triggered_writes_stop;
 183   // The smallest sequence number in the newly created file
 184   SequenceNumber smallest_seqno;
 185   // The largest sequence number in the newly created file
 186   SequenceNumber largest_seqno;
 187   // Table properties of the table being flushed
 188   TableProperties table_properties;
 189
 190   FlushReason flush_reason;
 191 };
 192
 193 struct CompactionJobInfo {
 194   CompactionJobInfo() = default;
 195   explicit CompactionJobInfo(const CompactionJobStats& _stats)
 196       : stats(_stats) {}
 197
 198   // the id of the column family where the compaction happened.
 199   uint32_t cf_id;
 200   // the name of the column family where the compaction happened.
 201   std::string cf_name;
 202   // the status indicating whether the compaction was successful or not.
 203   Status status;
 204   // the id of the thread that completed this compaction job.
 205   uint64_t thread_id;
 206   // the job id, which is unique in the same thread.
 207   int job_id;
 208   // the smallest input level of the compaction.
 209   int base_input_level;
 210   // the output level of the compaction.
 211   int output_level;
 212   // the names of the compaction input files.
 213   std::vector<std::string> input_files;
 214
 215   // the names of the compaction output files.
 216   std::vector<std::string> output_files;
 217   // Table properties for input and output tables.
 218   // The map is keyed by values from input_files and output_files.
 219   TablePropertiesCollection table_properties;
 220
 221   // Reason to run the compaction
 222   CompactionReason compaction_reason;
 223
 224   // Compression algorithm used for output files
 225   CompressionType compression;
 226
 227   // If non-null, this variable stores detailed information
 228   // about this compaction.
 229   CompactionJobStats stats;
 230 };
 231
 232 struct MemTableInfo {
 233   // the name of the column family to which memtable belongs
 234   std::string cf_name;
 235   // Sequence number of the first element that was inserted
 236   // into the memtable.
 237   SequenceNumber first_seqno;
 238   // Sequence number that is guaranteed to be smaller than or equal
 239   // to the sequence number of any key that could be inserted into this
 240   // memtable. It can then be assumed that any write with a larger(or equal)
 241   // sequence number will be present in this memtable or a later memtable.
 242   SequenceNumber earliest_seqno;
 243   // Total number of entries in memtable
 244   uint64_t num_entries;
 245   // Total number of deletes in memtable
 246   uint64_t num_deletes;
 247 };
 248
 249 struct ExternalFileIngestionInfo {
 250   // the name of the column family
 251   std::string cf_name;
 252   // Path of the file outside the DB
 253   std::string external_file_path;
 254   // Path of the file inside the DB
 255   std::string internal_file_path;
 256   // The global sequence number assigned to keys in this file
 257   SequenceNumber global_seqno;
 258   // Table properties of the table being flushed
 259   TableProperties table_properties;
 260 };
 261
 262 // EventListener class contains a set of callback functions that will
 263 // be called when specific RocksDB event happens such as flush.  It can
 264 // be used as a building block for developing custom features such as
 265 // stats-collector or external compaction algorithm.
 266 //
 267 // Note that callback functions should not run for an extended period of
 268 // time before the function returns, otherwise RocksDB may be blocked.
 269 // For example, it is not suggested to do DB::CompactFiles() (as it may
 270 // run for a long while) or issue many of DB::Put() (as Put may be blocked
 271 // in certain cases) in the same thread in the EventListener callback.
 272 // However, doing DB::CompactFiles() and DB::Put() in another thread is
 273 // considered safe.
 274 //
 275 // [Threading] All EventListener callback will be called using the
 276 // actual thread that involves in that specific event.   For example, it
 277 // is the RocksDB background flush thread that does the actual flush to
 278 // call EventListener::OnFlushCompleted().
 279 //
 280 // [Locking] All EventListener callbacks are designed to be called without
 281 // the current thread holding any DB mutex. This is to prevent potential
 282 // deadlock and performance issue when using EventListener callback
 283 // in a complex way.
 284 class EventListener {
 285  public:
 286   // A callback function to RocksDB which will be called whenever a
 287   // registered RocksDB flushes a file.  The default implementation is
 288   // no-op.
 289   //
 290   // Note that the this function must be implemented in a way such that
 291   // it should not run for an extended period of time before the function
 292   // returns.  Otherwise, RocksDB may be blocked.
 293   virtual void OnFlushCompleted(DB* /*db*/,
 294                                 const FlushJobInfo& /*flush_job_info*/) {}
 295
 296   // A callback function to RocksDB which will be called before a
 297   // RocksDB starts to flush memtables.  The default implementation is
 298   // no-op.
 299   //
 300   // Note that the this function must be implemented in a way such that
 301   // it should not run for an extended period of time before the function
 302   // returns.  Otherwise, RocksDB may be blocked.
 303   virtual void OnFlushBegin(DB* /*db*/,
 304                             const FlushJobInfo& /*flush_job_info*/) {}
 305
 306   // A callback function for RocksDB which will be called whenever
 307   // a SST file is deleted.  Different from OnCompactionCompleted and
 308   // OnFlushCompleted, this callback is designed for external logging
 309   // service and thus only provide string parameters instead
 310   // of a pointer to DB.  Applications that build logic basic based
 311   // on file creations and deletions is suggested to implement
 312   // OnFlushCompleted and OnCompactionCompleted.
 313   //
 314   // Note that if applications would like to use the passed reference
 315   // outside this function call, they should make copies from the
 316   // returned value.
 317   virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
 318
 319   // A callback function to RocksDB which will be called before a
 320   // RocksDB starts to compact.  The default implementation is
 321   // no-op.
 322   //
 323   // Note that the this function must be implemented in a way such that
 324   // it should not run for an extended period of time before the function
 325   // returns.  Otherwise, RocksDB may be blocked.
 326   virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
 327
 328   // A callback function for RocksDB which will be called whenever
 329   // a registered RocksDB compacts a file. The default implementation
 330   // is a no-op.
 331   //
 332   // Note that this function must be implemented in a way such that
 333   // it should not run for an extended period of time before the function
 334   // returns. Otherwise, RocksDB may be blocked.
 335   //
 336   // @param db a pointer to the rocksdb instance which just compacted
 337   //   a file.
 338   // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
 339   //  after this function is returned, and must be copied if it is needed
 340   //  outside of this function.
 341   virtual void OnCompactionCompleted(DB* /*db*/,
 342                                      const CompactionJobInfo& /*ci*/) {}
 343
 344   // A callback function for RocksDB which will be called whenever
 345   // a SST file is created.  Different from OnCompactionCompleted and
 346   // OnFlushCompleted, this callback is designed for external logging
 347   // service and thus only provide string parameters instead
 348   // of a pointer to DB.  Applications that build logic basic based
 349   // on file creations and deletions is suggested to implement
 350   // OnFlushCompleted and OnCompactionCompleted.
 351   //
 352   // Historically it will only be called if the file is successfully created.
 353   // Now it will also be called on failure case. User can check info.status
 354   // to see if it succeeded or not.
 355   //
 356   // Note that if applications would like to use the passed reference
 357   // outside this function call, they should make copies from these
 358   // returned value.
 359   virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
 360
 361   // A callback function for RocksDB which will be called before
 362   // a SST file is being created. It will follow by OnTableFileCreated after
 363   // the creation finishes.
 364   //
 365   // Note that if applications would like to use the passed reference
 366   // outside this function call, they should make copies from these
 367   // returned value.
 368   virtual void OnTableFileCreationStarted(
 369       const TableFileCreationBriefInfo& /*info*/) {}
 370
 371   // A callback function for RocksDB which will be called before
 372   // a memtable is made immutable.
 373   //
 374   // Note that the this function must be implemented in a way such that
 375   // it should not run for an extended period of time before the function
 376   // returns.  Otherwise, RocksDB may be blocked.
 377   //
 378   // Note that if applications would like to use the passed reference
 379   // outside this function call, they should make copies from these
 380   // returned value.
 381   virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
 382
 383   // A callback function for RocksDB which will be called before
 384   // a column family handle is deleted.
 385   //
 386   // Note that the this function must be implemented in a way such that
 387   // it should not run for an extended period of time before the function
 388   // returns.  Otherwise, RocksDB may be blocked.
 389   // @param handle is a pointer to the column family handle to be deleted
 390   // which will become a dangling pointer after the deletion.
 391   virtual void OnColumnFamilyHandleDeletionStarted(
 392       ColumnFamilyHandle* /*handle*/) {}
 393
 394   // A callback function for RocksDB which will be called after an external
 395   // file is ingested using IngestExternalFile.
 396   //
 397   // Note that the this function will run on the same thread as
 398   // IngestExternalFile(), if this function is blocked, IngestExternalFile()
 399   // will be blocked from finishing.
 400   virtual void OnExternalFileIngested(
 401       DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
 402
 403   // A callback function for RocksDB which will be called before setting the
 404   // background error status to a non-OK value. The new background error status
 405   // is provided in `bg_error` and can be modified by the callback. E.g., a
 406   // callback can suppress errors by resetting it to Status::OK(), thus
 407   // preventing the database from entering read-only mode. We do not provide any
 408   // guarantee when failed flushes/compactions will be rescheduled if the user
 409   // suppresses an error.
 410   //
 411   // Note that this function can run on the same threads as flush, compaction,
 412   // and user writes. So, it is extremely important not to perform heavy
 413   // computations or blocking calls in this function.
 414   virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
 415                                  Status* /* bg_error */) {}
 416
 417   // A callback function for RocksDB which will be called whenever a change
 418   // of superversion triggers a change of the stall conditions.
 419   //
 420   // Note that the this function must be implemented in a way such that
 421   // it should not run for an extended period of time before the function
 422   // returns.  Otherwise, RocksDB may be blocked.
 423   virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
 424
 425   // A callback function for RocksDB which will be called whenever a file read
 426   // operation finishes.
 427   virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
 428
 429   // A callback function for RocksDB which will be called whenever a file write
 430   // operation finishes.
 431   virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
 432
 433   // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If
 434   // false, then they won't be called.
 435   virtual bool ShouldBeNotifiedOnFileIO() { return false; }
 436
 437   // A callback function for RocksDB which will be called just before
 438   // starting the automatic recovery process for recoverable background
 439   // errors, such as NoSpace(). The callback can suppress the automatic
 440   // recovery by setting *auto_recovery to false. The database will then
 441   // have to be transitioned out of read-only mode by calling DB::Resume()
 442   virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
 443                                     Status /* bg_error */,
 444                                     bool* /* auto_recovery */) {}
 445
 446   // A callback function for RocksDB which will be called once the database
 447   // is recovered from read-only mode after an error. When this is called, it
 448   // means normal writes to the database can be issued and the user can
 449   // initiate any further recovery actions needed
 450   virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
 451
 452   virtual ~EventListener() {}
 453 };
 454
 455 #else
 456
 457 class EventListener {};
 458
 459 #endif  // ROCKSDB_LITE
 460
 461 }  // namespace rocksdb