1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #include <unordered_map>
#include <utility>
12 #include "rocksdb/compaction_job_stats.h"
13 #include "rocksdb/status.h"
14 #include "rocksdb/table_properties.h"
// Table properties keyed by string; each value is a shared, read-only
// TableProperties object.
18 using TablePropertiesCollection =
19 std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
// Forward declarations for types referenced further down in this header.
22 class ColumnFamilyHandle
;
24 struct CompactionJobStats
;
// Declared with a fixed underlying type (unsigned char) and no enumerator
// list here.
25 enum CompressionType
: unsigned char;
// Reason a table file was created; stored in
// TableFileCreationBriefInfo::reason.
// NOTE(review): the enumerator list is not visible in this listing.
27 enum class TableFileCreationReason
{
// Brief description of a table file creation. It is the argument type of
// EventListener::OnTableFileCreationStarted() and the base class of
// TableFileCreationInfo.
34 struct TableFileCreationBriefInfo
{
35 // The name of the database where the file was created.
37 // The name of the column family where the file was created.
39 // The path to the created file.
40 std::string file_path
;
41 // the id of the job (which could be flush or compaction) that
44 // The reason for creating the table.
45 TableFileCreationReason reason
;
// Full description of a created table file: everything in
// TableFileCreationBriefInfo plus the table's properties. It is the argument
// type of EventListener::OnTableFileCreated().
48 struct TableFileCreationInfo
: public TableFileCreationBriefInfo
{
49 TableFileCreationInfo() = default;
// Takes over the contents of `prop`. Inside the constructor the named rvalue
// reference `prop` is an lvalue, so it has to go through std::move() to be
// moved into table_properties instead of being copied.
50 explicit TableFileCreationInfo(TableProperties
&& prop
)
51 : table_properties(std::move(prop)
) {}
52 // The size of the file.
54 // Detailed properties of the created file.
55 TableProperties table_properties
;
56 // The status indicating whether the creation was successful or not.
// Why a compaction was run; stored in CompactionJobInfo::compaction_reason.
// The bracketed tag at the start of a comment ([Level], [Universal], [FIFO])
// presumably names the compaction style the reason applies to.
60 enum class CompactionReason
: int {
62 // [Level] number of L0 files > level0_file_num_compaction_trigger
64 // [Level] total size of level > MaxBytesForLevel()
66 // [Universal] Compacting for size amplification
67 kUniversalSizeAmplification
,
68 // [Universal] Compacting for size ratio
70 // [Universal] number of sorted runs > level0_file_num_compaction_trigger
71 kUniversalSortedRunNum
,
72 // [FIFO] total size > max_table_files_size
74 // [FIFO] reduce number of files.
76 // [FIFO] files with creation time < (current_time - interval)
80 // DB::SuggestCompactRange() marked files for compaction
81 kFilesMarkedForCompaction
,
82 // [Level] Automatic compaction within bottommost level to clean up duplicate
83 // versions of the same user key, usually due to a released snapshot.
85 // Compaction based on TTL
87 // According to the comments in flush_job.cc, RocksDB treats flush as
88 // a level 0 compaction in internal stats.
90 // Compaction caused by external sst file ingestion
91 kExternalSstIngestion
,
92 // Total number of compaction reasons; new reasons must be added above this.
// Why a flush happened; stored in FlushJobInfo::flush_reason. Each enumerator
// is given an explicit value.
// NOTE(review): the visible values are not contiguous (0x03-0x06, 0x09, 0xb);
// the enumerators for the remaining values are not visible in this listing.
96 enum class FlushReason
: int {
100 kExternalFileIngestion
= 0x03,
101 kManualCompaction
= 0x04,
102 kWriteBufferManager
= 0x05,
103 kWriteBufferFull
= 0x06,
106 kAutoCompaction
= 0x09,
108 kErrorRecovery
= 0xb,
// Reason attached to a background error; passed to
// EventListener::OnBackgroundError() and OnErrorRecoveryBegin().
// NOTE(review): the enumerator list is not visible in this listing.
111 enum class BackgroundErrorReason
{
// Write-stall state; the type of WriteStallInfo::cur and WriteStallInfo::prev.
// NOTE(review): the enumerator list is not visible in this listing.
118 enum class WriteStallCondition
{
// Describes a change of write-stall conditions; the argument type of
// EventListener::OnStallConditionsChanged().
124 struct WriteStallInfo
{
125 // The name of the column family.
127 // State of the write controller.
// NOTE(review): presumably `cur` is the condition after the change and `prev`
// the condition before it - confirm.
129 WriteStallCondition cur
;
130 WriteStallCondition prev
;
// Describes the deletion of a table file; the argument type of
// EventListener::OnTableFileDeleted().
136 struct TableFileDeletionInfo
{
137 // The name of the database where the file was deleted.
139 // The path to the deleted file.
140 std::string file_path
;
141 // The id of the job which deleted the file.
143 // The status indicating whether the deletion was successful or not.
// Describes a finished file read or write; the argument type of
// EventListener::OnFileReadFinish() and OnFileWriteFinish().
//
// path, start_timestamp and finish_timestamp are references bound to the
// constructor arguments, not copies, so the referenced objects must outlive
// this struct.
147 struct FileOperationInfo
{
// A point in time on the system clock, in nanoseconds.
148 using TimePoint
= std::chrono::time_point
<std::chrono::system_clock
,
149 std::chrono::nanoseconds
>;
// NOTE(review): presumably the path of the file that was read or written.
151 const std::string
& path
;
// NOTE(review): presumably the times at which the operation started and
// finished.
154 const TimePoint
& start_timestamp
;
155 const TimePoint
& finish_timestamp
;
// Binds the three reference members to the given objects; nothing is copied.
157 FileOperationInfo(const std::string
& _path
, const TimePoint
& start
,
158 const TimePoint
& finish
)
159 : path(_path
), start_timestamp(start
), finish_timestamp(finish
) {}
// Describes one flush job; the argument type of
// EventListener::OnFlushBegin() and OnFlushCompleted().
162 struct FlushJobInfo
{
163 // The id of the column family.
165 // The name of the column family.
167 // The path to the newly created file.
168 std::string file_path
;
169 // The id of the thread that completed this flush job.
171 // The job id, which is unique in the same thread.
173 // If true, then rocksdb is currently slowing down all writes to prevent
174 // creating too many Level 0 files, as compaction seems unable to
175 // keep up with the write request speed. This indicates that there are
176 // too many files in Level 0.
177 bool triggered_writes_slowdown
;
178 // If true, then rocksdb is currently blocking any writes to prevent
179 // creating more L0 files. This indicates that there are too many
180 // files in level 0. Compactions should try to compact L0 files down
181 // to lower levels as soon as possible.
182 bool triggered_writes_stop
;
183 // The smallest sequence number in the newly created file.
184 SequenceNumber smallest_seqno
;
185 // The largest sequence number in the newly created file.
186 SequenceNumber largest_seqno
;
187 // Table properties of the table being flushed.
188 TableProperties table_properties
;
// The reason for this flush (see FlushReason).
190 FlushReason flush_reason
;
// Describes one compaction job; the argument type of
// EventListener::OnCompactionBegin() and OnCompactionCompleted().
193 struct CompactionJobInfo
{
194 CompactionJobInfo() = default;
// NOTE(review): the initializer list / body of this constructor is not
// visible in this listing; presumably it initializes `stats` from `_stats` -
// confirm.
195 explicit CompactionJobInfo(const CompactionJobStats
& _stats
)
198 // The id of the column family where the compaction happened.
200 // The name of the column family where the compaction happened.
202 // The status indicating whether the compaction was successful or not.
204 // The id of the thread that completed this compaction job.
206 // The job id, which is unique in the same thread.
208 // The smallest input level of the compaction.
209 int base_input_level
;
210 // The output level of the compaction.
212 // The names of the compaction input files.
213 std::vector
<std::string
> input_files
;
215 // The names of the compaction output files.
216 std::vector
<std::string
> output_files
;
217 // Table properties for input and output tables.
218 // The map is keyed by values from input_files and output_files.
219 TablePropertiesCollection table_properties
;
221 // Reason to run the compaction.
222 CompactionReason compaction_reason
;
224 // Compression algorithm used for output files.
225 CompressionType compression
;
227 // Detailed information about this compaction. `stats` is held by value,
228 // so there is no null state to check for.
229 CompactionJobStats stats
;
// Describes a memtable; the argument type of
// EventListener::OnMemTableSealed().
232 struct MemTableInfo
{
233 // The name of the column family to which the memtable belongs.
235 // Sequence number of the first element that was inserted
236 // into the memtable.
237 SequenceNumber first_seqno
;
238 // Sequence number that is guaranteed to be smaller than or equal
239 // to the sequence number of any key that could be inserted into this
240 // memtable. It can then be assumed that any write with a larger (or equal)
241 // sequence number will be present in this memtable or a later memtable.
242 SequenceNumber earliest_seqno
;
243 // Total number of entries in the memtable.
244 uint64_t num_entries
;
245 // Total number of deletes in the memtable.
246 uint64_t num_deletes
;
// Describes an ingested external file; the argument type of
// EventListener::OnExternalFileIngested().
249 struct ExternalFileIngestionInfo
{
250 // The name of the column family.
252 // Path of the file outside the DB.
253 std::string external_file_path
;
254 // Path of the file inside the DB.
255 std::string internal_file_path
;
256 // The global sequence number assigned to keys in this file.
257 SequenceNumber global_seqno
;
258 // Table properties of the table being flushed.
// NOTE(review): "flushed" looks copied from FlushJobInfo; presumably these are
// the properties of the ingested file - confirm.
259 TableProperties table_properties
;
262 // The EventListener class contains a set of callback functions that will
263 // be called when a specific RocksDB event happens, such as a flush. It can
264 // be used as a building block for developing custom features such as a
265 // stats-collector or an external compaction algorithm.
267 // Note that callback functions should not run for an extended period of
268 // time before the function returns, otherwise RocksDB may be blocked.
269 // For example, it is not suggested to do DB::CompactFiles() (as it may
270 // run for a long while) or issue many DB::Put() calls (as Put may be blocked
271 // in certain cases) in the same thread in the EventListener callback.
272 // However, doing DB::CompactFiles() and DB::Put() in another thread is
275 // [Threading] All EventListener callbacks will be called using the
276 // actual thread that is involved in that specific event. For example, it
277 // is the RocksDB background flush thread that does the actual flush that
278 // calls EventListener::OnFlushCompleted().
280 // [Locking] All EventListener callbacks are designed to be called without
281 // the current thread holding any DB mutex. This is to prevent potential
282 // deadlock and performance issue when using EventListener callback
284 class EventListener
{
286 // A callback function to RocksDB which will be called whenever a
287 // registered RocksDB flushes a file. The default implementation is a no-op.
290 // Note that this function must be implemented in a way such that
291 // it should not run for an extended period of time before the function
292 // returns. Otherwise, RocksDB may be blocked.
293 virtual void OnFlushCompleted(DB
* /*db*/,
294 const FlushJobInfo
& /*flush_job_info*/) {}
296 // A callback function to RocksDB which will be called before a
297 // RocksDB starts to flush memtables. The default implementation is a no-op.
300 // Note that this function must be implemented in a way such that
301 // it should not run for an extended period of time before the function
302 // returns. Otherwise, RocksDB may be blocked.
303 virtual void OnFlushBegin(DB
* /*db*/,
304 const FlushJobInfo
& /*flush_job_info*/) {}
306 // A callback function for RocksDB which will be called whenever
307 // a SST file is deleted. Different from OnCompactionCompleted and
308 // OnFlushCompleted, this callback is designed for external logging
309 // services and thus only provides string parameters instead
310 // of a pointer to DB. Applications that build logic based
311 // on file creations and deletions are advised to implement
312 // OnFlushCompleted and OnCompactionCompleted.
314 // Note that if applications would like to use the passed reference
315 // outside this function call, they should make copies from the
317 virtual void OnTableFileDeleted(const TableFileDeletionInfo
& /*info*/) {}
319 // A callback function to RocksDB which will be called before a
320 // RocksDB starts to compact. The default implementation is a no-op.
323 // Note that this function must be implemented in a way such that
324 // it should not run for an extended period of time before the function
325 // returns. Otherwise, RocksDB may be blocked.
326 virtual void OnCompactionBegin(DB
* /*db*/, const CompactionJobInfo
& /*ci*/) {}
328 // A callback function for RocksDB which will be called whenever
329 // a registered RocksDB compacts a file. The default implementation is a no-op.
332 // Note that this function must be implemented in a way such that
333 // it should not run for an extended period of time before the function
334 // returns. Otherwise, RocksDB may be blocked.
336 // @param db a pointer to the rocksdb instance which just compacted
338 // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
339 // after this function returns, and must be copied if it is needed
340 // outside of this function.
341 virtual void OnCompactionCompleted(DB
* /*db*/,
342 const CompactionJobInfo
& /*ci*/) {}
344 // A callback function for RocksDB which will be called whenever
345 // a SST file is created. Different from OnCompactionCompleted and
346 // OnFlushCompleted, this callback is designed for external logging
347 // services and thus only provides string parameters instead
348 // of a pointer to DB. Applications that build logic based
349 // on file creations and deletions are advised to implement
350 // OnFlushCompleted and OnCompactionCompleted.
352 // Historically it was only called if the file was successfully created.
353 // Now it is also called in the failure case. Users can check info.status
354 // to see if it succeeded or not.
356 // Note that if applications would like to use the passed reference
357 // outside this function call, they should make copies from these
359 virtual void OnTableFileCreated(const TableFileCreationInfo
& /*info*/) {}
361 // A callback function for RocksDB which will be called before
362 // a SST file is created. It will be followed by OnTableFileCreated after
363 // the creation finishes.
365 // Note that if applications would like to use the passed reference
366 // outside this function call, they should make copies from these
368 virtual void OnTableFileCreationStarted(
369 const TableFileCreationBriefInfo
& /*info*/) {}
371 // A callback function for RocksDB which will be called before
372 // a memtable is made immutable.
374 // Note that this function must be implemented in a way such that
375 // it should not run for an extended period of time before the function
376 // returns. Otherwise, RocksDB may be blocked.
378 // Note that if applications would like to use the passed reference
379 // outside this function call, they should make copies from these
381 virtual void OnMemTableSealed(const MemTableInfo
& /*info*/) {}
383 // A callback function for RocksDB which will be called before
384 // a column family handle is deleted.
386 // Note that this function must be implemented in a way such that
387 // it should not run for an extended period of time before the function
388 // returns. Otherwise, RocksDB may be blocked.
389 // @param handle is a pointer to the column family handle to be deleted
390 // which will become a dangling pointer after the deletion.
391 virtual void OnColumnFamilyHandleDeletionStarted(
392 ColumnFamilyHandle
* /*handle*/) {}
394 // A callback function for RocksDB which will be called after an external
395 // file is ingested using IngestExternalFile.
397 // Note that this function will run on the same thread as
398 // IngestExternalFile(); if this function is blocked, IngestExternalFile()
399 // will be blocked from finishing.
400 virtual void OnExternalFileIngested(
401 DB
* /*db*/, const ExternalFileIngestionInfo
& /*info*/) {}
403 // A callback function for RocksDB which will be called before setting the
404 // background error status to a non-OK value. The new background error status
405 // is provided in `bg_error` and can be modified by the callback. E.g., a
406 // callback can suppress errors by resetting it to Status::OK(), thus
407 // preventing the database from entering read-only mode. We do not provide any
408 // guarantee when failed flushes/compactions will be rescheduled if the user
409 // suppresses an error.
411 // Note that this function can run on the same threads as flush, compaction,
412 // and user writes. So, it is extremely important not to perform heavy
413 // computations or blocking calls in this function.
414 virtual void OnBackgroundError(BackgroundErrorReason
/* reason */,
415 Status
* /* bg_error */) {}
417 // A callback function for RocksDB which will be called whenever a change
418 // of superversion triggers a change of the stall conditions.
420 // Note that this function must be implemented in a way such that
421 // it should not run for an extended period of time before the function
422 // returns. Otherwise, RocksDB may be blocked.
423 virtual void OnStallConditionsChanged(const WriteStallInfo
& /*info*/) {}
425 // A callback function for RocksDB which will be called whenever a file read
426 // operation finishes.
427 virtual void OnFileReadFinish(const FileOperationInfo
& /* info */) {}
429 // A callback function for RocksDB which will be called whenever a file write
430 // operation finishes.
431 virtual void OnFileWriteFinish(const FileOperationInfo
& /* info */) {}
433 // If true, OnFileReadFinish and OnFileWriteFinish will be called. If
434 // false, they won't be called. The default implementation returns false.
435 virtual bool ShouldBeNotifiedOnFileIO() { return false; }
437 // A callback function for RocksDB which will be called just before
438 // starting the automatic recovery process for recoverable background
439 // errors, such as NoSpace(). The callback can suppress the automatic
440 // recovery by setting *auto_recovery to false. The database will then
441 // have to be transitioned out of read-only mode by calling DB::Resume().
442 virtual void OnErrorRecoveryBegin(BackgroundErrorReason
/* reason */,
443 Status
/* bg_error */,
444 bool* /* auto_recovery */) {}
446 // A callback function for RocksDB which will be called once the database
447 // is recovered from read-only mode after an error. When this is called, it
448 // means normal writes to the database can be issued and the user can
449 // initiate any further recovery actions needed.
450 virtual void OnErrorRecoveryCompleted(Status
/* old_bg_error */) {}
// Virtual destructor, so a listener can be destroyed through an
// EventListener pointer.
452 virtual ~EventListener() {}
// Empty EventListener definition with no callbacks.
// NOTE(review): presumably the alternative branch of the ROCKSDB_LITE
// conditional that the #endif just below closes; the matching #if/#else is
// not visible in this listing - confirm.
457 class EventListener
{};
459 #endif // ROCKSDB_LITE
461 } // namespace rocksdb