1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <unordered_map>
#include <utility>

#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/compression_type.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
20 namespace ROCKSDB_NAMESPACE
{
22 typedef std::unordered_map
<std::string
, std::shared_ptr
<const TableProperties
>>
23 TablePropertiesCollection
;
26 class ColumnFamilyHandle
;
28 struct CompactionJobStats
;
30 enum class TableFileCreationReason
{
37 struct TableFileCreationBriefInfo
{
38 // the name of the database where the file was created
40 // the name of the column family where the file was created.
42 // the path to the created file.
43 std::string file_path
;
44 // the id of the job (which could be flush or compaction) that
47 // reason of creating the table.
48 TableFileCreationReason reason
;
51 struct TableFileCreationInfo
: public TableFileCreationBriefInfo
{
52 TableFileCreationInfo() = default;
53 explicit TableFileCreationInfo(TableProperties
&& prop
)
54 : table_properties(prop
) {}
55 // the size of the file.
57 // Detailed properties of the created file.
58 TableProperties table_properties
;
59 // The status indicating whether the creation was successful or not.
61 // The checksum of the table file being created
62 std::string file_checksum
;
63 // The checksum function name of checksum generator used for this table file
64 std::string file_checksum_func_name
;
67 enum class CompactionReason
: int {
69 // [Level] number of L0 files > level0_file_num_compaction_trigger
71 // [Level] total size of level > MaxBytesForLevel()
73 // [Universal] Compacting for size amplification
74 kUniversalSizeAmplification
,
75 // [Universal] Compacting for size ratio
77 // [Universal] number of sorted runs > level0_file_num_compaction_trigger
78 kUniversalSortedRunNum
,
79 // [FIFO] total size > max_table_files_size
81 // [FIFO] reduce number of files.
83 // [FIFO] files with creation time < (current_time - interval)
87 // DB::SuggestCompactRange() marked files for compaction
88 kFilesMarkedForCompaction
,
89 // [Level] Automatic compaction within bottommost level to cleanup duplicate
90 // versions of same user key, usually due to a released snapshot.
92 // Compaction based on TTL
94 // According to the comments in flush_job.cc, RocksDB treats flush as
95 // a level 0 compaction in internal stats.
97 // Compaction caused by external sst file ingestion
98 kExternalSstIngestion
,
99 // Compaction due to SST file being too old
101 // total number of compaction reasons, new reasons must be added above this.
105 enum class FlushReason
: int {
107 kGetLiveFiles
= 0x01,
109 kExternalFileIngestion
= 0x03,
110 kManualCompaction
= 0x04,
111 kWriteBufferManager
= 0x05,
112 kWriteBufferFull
= 0x06,
115 kAutoCompaction
= 0x09,
117 kErrorRecovery
= 0xb,
118 // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable
119 // will not be called to avoid many small immutable memtables.
120 kErrorRecoveryRetryFlush
= 0xc,
123 enum class BackgroundErrorReason
{
132 enum class WriteStallCondition
{
138 struct WriteStallInfo
{
139 // the name of the column family
141 // state of the write controller
143 WriteStallCondition cur
;
144 WriteStallCondition prev
;
150 struct TableFileDeletionInfo
{
151 // The name of the database where the file was deleted.
153 // The path to the deleted file.
154 std::string file_path
;
155 // The id of the job which deleted the file.
157 // The status indicating whether the deletion was successful or not.
161 enum class FileOperationType
{
172 struct FileOperationInfo
{
173 using Duration
= std::chrono::nanoseconds
;
174 using SteadyTimePoint
=
175 std::chrono::time_point
<std::chrono::steady_clock
, Duration
>;
176 using SystemTimePoint
=
177 std::chrono::time_point
<std::chrono::system_clock
, Duration
>;
178 using StartTimePoint
= std::pair
<SystemTimePoint
, SteadyTimePoint
>;
179 using FinishTimePoint
= SteadyTimePoint
;
181 FileOperationType type
;
182 const std::string
& path
;
185 const Duration duration
;
186 const SystemTimePoint
& start_ts
;
188 FileOperationInfo(const FileOperationType _type
, const std::string
& _path
,
189 const StartTimePoint
& _start_ts
,
190 const FinishTimePoint
& _finish_ts
, const Status
& _status
)
193 duration(std::chrono::duration_cast
<std::chrono::nanoseconds
>(
194 _finish_ts
- _start_ts
.second
)),
195 start_ts(_start_ts
.first
),
197 static StartTimePoint
StartNow() {
198 return std::make_pair
<SystemTimePoint
, SteadyTimePoint
>(
199 std::chrono::system_clock::now(), std::chrono::steady_clock::now());
201 static FinishTimePoint
FinishNow() {
202 return std::chrono::steady_clock::now();
206 struct FlushJobInfo
{
207 // the id of the column family
209 // the name of the column family
211 // the path to the newly created file
212 std::string file_path
;
213 // the file number of the newly created file
214 uint64_t file_number
;
215 // the oldest blob file referenced by the newly created file
216 uint64_t oldest_blob_file_number
;
217 // the id of the thread that completed this flush job.
219 // the job id, which is unique in the same thread.
221 // If true, then rocksdb is currently slowing-down all writes to prevent
222 // creating too many Level 0 files as compaction seems not able to
223 // catch up the write request speed. This indicates that there are
224 // too many files in Level 0.
225 bool triggered_writes_slowdown
;
226 // If true, then rocksdb is currently blocking any writes to prevent
227 // creating more L0 files. This indicates that there are too many
228 // files in level 0. Compactions should try to compact L0 files down
229 // to lower levels as soon as possible.
230 bool triggered_writes_stop
;
231 // The smallest sequence number in the newly created file
232 SequenceNumber smallest_seqno
;
233 // The largest sequence number in the newly created file
234 SequenceNumber largest_seqno
;
235 // Table properties of the table being flushed
236 TableProperties table_properties
;
238 FlushReason flush_reason
;
241 struct CompactionFileInfo
{
242 // The level of the file.
245 // The file number of the file.
246 uint64_t file_number
;
248 // The file number of the oldest blob file this SST file references.
249 uint64_t oldest_blob_file_number
;
252 struct CompactionJobInfo
{
253 ~CompactionJobInfo() { status
.PermitUncheckedError(); }
254 // the id of the column family where the compaction happened.
256 // the name of the column family where the compaction happened.
258 // the status indicating whether the compaction was successful or not.
260 // the id of the thread that completed this compaction job.
262 // the job id, which is unique in the same thread.
264 // the smallest input level of the compaction.
265 int base_input_level
;
266 // the output level of the compaction.
269 // The following variables contain information about compaction inputs
270 // and outputs. A file may appear in both the input and output lists
271 // if it was simply moved to a different level. The order of elements
272 // is the same across input_files and input_file_infos; similarly, it is
273 // the same across output_files and output_file_infos.
275 // The names of the compaction input files.
276 std::vector
<std::string
> input_files
;
278 // Additional information about the compaction input files.
279 std::vector
<CompactionFileInfo
> input_file_infos
;
281 // The names of the compaction output files.
282 std::vector
<std::string
> output_files
;
284 // Additional information about the compaction output files.
285 std::vector
<CompactionFileInfo
> output_file_infos
;
287 // Table properties for input and output tables.
288 // The map is keyed by values from input_files and output_files.
289 TablePropertiesCollection table_properties
;
291 // Reason to run the compaction
292 CompactionReason compaction_reason
;
294 // Compression algorithm used for output files
295 CompressionType compression
;
297 // Statistics and other additional details on the compaction
298 CompactionJobStats stats
;
301 struct MemTableInfo
{
302 // the name of the column family to which memtable belongs
304 // Sequence number of the first element that was inserted
305 // into the memtable.
306 SequenceNumber first_seqno
;
307 // Sequence number that is guaranteed to be smaller than or equal
308 // to the sequence number of any key that could be inserted into this
309 // memtable. It can then be assumed that any write with a larger(or equal)
310 // sequence number will be present in this memtable or a later memtable.
311 SequenceNumber earliest_seqno
;
312 // Total number of entries in memtable
313 uint64_t num_entries
;
314 // Total number of deletes in memtable
315 uint64_t num_deletes
;
318 struct ExternalFileIngestionInfo
{
319 // the name of the column family
321 // Path of the file outside the DB
322 std::string external_file_path
;
323 // Path of the file inside the DB
324 std::string internal_file_path
;
325 // The global sequence number assigned to keys in this file
326 SequenceNumber global_seqno
;
// Table properties of the table being ingested
328 TableProperties table_properties
;
// The EventListener class contains a set of callback functions that will
// be called when a specific RocksDB event happens, such as a flush. It can
// be used as a building block for developing custom features such as a
// stats-collector or an external compaction algorithm.
//
// Note that callback functions should not run for an extended period of
// time before returning, otherwise RocksDB may be blocked.
// For example, it is not suggested to do DB::CompactFiles() (as it may
// run for a long while) or issue many DB::Put() calls (as Put may be blocked
// in certain cases) in the same thread in the EventListener callback.
// However, doing DB::CompactFiles() and DB::Put() in another thread is
// considered safe.
//
// [Threading] All EventListener callbacks will be called using the
// actual thread that is involved in that specific event. For example, it
// is the RocksDB background flush thread that does the actual flush that
// calls EventListener::OnFlushCompleted().
//
// [Locking] All EventListener callbacks are designed to be called without
// the current thread holding any DB mutex. This is to prevent potential
// deadlocks and performance issues when using EventListener callbacks
// in a complex way.
353 class EventListener
{
355 // A callback function to RocksDB which will be called whenever a
356 // registered RocksDB flushes a file. The default implementation is
359 // Note that the this function must be implemented in a way such that
360 // it should not run for an extended period of time before the function
361 // returns. Otherwise, RocksDB may be blocked.
362 virtual void OnFlushCompleted(DB
* /*db*/,
363 const FlushJobInfo
& /*flush_job_info*/) {}
365 // A callback function to RocksDB which will be called before a
366 // RocksDB starts to flush memtables. The default implementation is
369 // Note that the this function must be implemented in a way such that
370 // it should not run for an extended period of time before the function
371 // returns. Otherwise, RocksDB may be blocked.
372 virtual void OnFlushBegin(DB
* /*db*/,
373 const FlushJobInfo
& /*flush_job_info*/) {}
375 // A callback function for RocksDB which will be called whenever
376 // a SST file is deleted. Different from OnCompactionCompleted and
377 // OnFlushCompleted, this callback is designed for external logging
378 // service and thus only provide string parameters instead
379 // of a pointer to DB. Applications that build logic basic based
380 // on file creations and deletions is suggested to implement
381 // OnFlushCompleted and OnCompactionCompleted.
383 // Note that if applications would like to use the passed reference
384 // outside this function call, they should make copies from the
386 virtual void OnTableFileDeleted(const TableFileDeletionInfo
& /*info*/) {}
388 // A callback function to RocksDB which will be called before a
389 // RocksDB starts to compact. The default implementation is
392 // Note that the this function must be implemented in a way such that
393 // it should not run for an extended period of time before the function
394 // returns. Otherwise, RocksDB may be blocked.
395 virtual void OnCompactionBegin(DB
* /*db*/, const CompactionJobInfo
& /*ci*/) {}
397 // A callback function for RocksDB which will be called whenever
398 // a registered RocksDB compacts a file. The default implementation
401 // Note that this function must be implemented in a way such that
402 // it should not run for an extended period of time before the function
403 // returns. Otherwise, RocksDB may be blocked.
405 // @param db a pointer to the rocksdb instance which just compacted
407 // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
408 // after this function is returned, and must be copied if it is needed
409 // outside of this function.
410 virtual void OnCompactionCompleted(DB
* /*db*/,
411 const CompactionJobInfo
& /*ci*/) {}
413 // A callback function for RocksDB which will be called whenever
414 // a SST file is created. Different from OnCompactionCompleted and
415 // OnFlushCompleted, this callback is designed for external logging
416 // service and thus only provide string parameters instead
417 // of a pointer to DB. Applications that build logic basic based
418 // on file creations and deletions is suggested to implement
419 // OnFlushCompleted and OnCompactionCompleted.
421 // Historically it will only be called if the file is successfully created.
422 // Now it will also be called on failure case. User can check info.status
423 // to see if it succeeded or not.
425 // Note that if applications would like to use the passed reference
426 // outside this function call, they should make copies from these
428 virtual void OnTableFileCreated(const TableFileCreationInfo
& /*info*/) {}
430 // A callback function for RocksDB which will be called before
431 // a SST file is being created. It will follow by OnTableFileCreated after
432 // the creation finishes.
434 // Note that if applications would like to use the passed reference
435 // outside this function call, they should make copies from these
437 virtual void OnTableFileCreationStarted(
438 const TableFileCreationBriefInfo
& /*info*/) {}
440 // A callback function for RocksDB which will be called before
441 // a memtable is made immutable.
443 // Note that the this function must be implemented in a way such that
444 // it should not run for an extended period of time before the function
445 // returns. Otherwise, RocksDB may be blocked.
447 // Note that if applications would like to use the passed reference
448 // outside this function call, they should make copies from these
450 virtual void OnMemTableSealed(const MemTableInfo
& /*info*/) {}
452 // A callback function for RocksDB which will be called before
453 // a column family handle is deleted.
455 // Note that the this function must be implemented in a way such that
456 // it should not run for an extended period of time before the function
457 // returns. Otherwise, RocksDB may be blocked.
458 // @param handle is a pointer to the column family handle to be deleted
459 // which will become a dangling pointer after the deletion.
460 virtual void OnColumnFamilyHandleDeletionStarted(
461 ColumnFamilyHandle
* /*handle*/) {}
463 // A callback function for RocksDB which will be called after an external
464 // file is ingested using IngestExternalFile.
466 // Note that the this function will run on the same thread as
467 // IngestExternalFile(), if this function is blocked, IngestExternalFile()
468 // will be blocked from finishing.
469 virtual void OnExternalFileIngested(
470 DB
* /*db*/, const ExternalFileIngestionInfo
& /*info*/) {}
472 // A callback function for RocksDB which will be called before setting the
473 // background error status to a non-OK value. The new background error status
474 // is provided in `bg_error` and can be modified by the callback. E.g., a
475 // callback can suppress errors by resetting it to Status::OK(), thus
476 // preventing the database from entering read-only mode. We do not provide any
477 // guarantee when failed flushes/compactions will be rescheduled if the user
478 // suppresses an error.
480 // Note that this function can run on the same threads as flush, compaction,
481 // and user writes. So, it is extremely important not to perform heavy
482 // computations or blocking calls in this function.
483 virtual void OnBackgroundError(BackgroundErrorReason
/* reason */,
484 Status
* /* bg_error */) {}
486 // A callback function for RocksDB which will be called whenever a change
487 // of superversion triggers a change of the stall conditions.
489 // Note that the this function must be implemented in a way such that
490 // it should not run for an extended period of time before the function
491 // returns. Otherwise, RocksDB may be blocked.
492 virtual void OnStallConditionsChanged(const WriteStallInfo
& /*info*/) {}
494 // A callback function for RocksDB which will be called whenever a file read
495 // operation finishes.
496 virtual void OnFileReadFinish(const FileOperationInfo
& /* info */) {}
498 // A callback function for RocksDB which will be called whenever a file write
499 // operation finishes.
500 virtual void OnFileWriteFinish(const FileOperationInfo
& /* info */) {}
502 // A callback function for RocksDB which will be called whenever a file flush
503 // operation finishes.
504 virtual void OnFileFlushFinish(const FileOperationInfo
& /* info */) {}
506 // A callback function for RocksDB which will be called whenever a file sync
507 // operation finishes.
508 virtual void OnFileSyncFinish(const FileOperationInfo
& /* info */) {}
510 // A callback function for RocksDB which will be called whenever a file
511 // rangeSync operation finishes.
512 virtual void OnFileRangeSyncFinish(const FileOperationInfo
& /* info */) {}
514 // A callback function for RocksDB which will be called whenever a file
515 // truncate operation finishes.
516 virtual void OnFileTruncateFinish(const FileOperationInfo
& /* info */) {}
518 // A callback function for RocksDB which will be called whenever a file close
519 // operation finishes.
520 virtual void OnFileCloseFinish(const FileOperationInfo
& /* info */) {}
522 // If true, the OnFile*Finish functions will be called. If
523 // false, then they won't be called.
524 virtual bool ShouldBeNotifiedOnFileIO() { return false; }
526 // A callback function for RocksDB which will be called just before
527 // starting the automatic recovery process for recoverable background
528 // errors, such as NoSpace(). The callback can suppress the automatic
529 // recovery by setting *auto_recovery to false. The database will then
530 // have to be transitioned out of read-only mode by calling DB::Resume()
531 virtual void OnErrorRecoveryBegin(BackgroundErrorReason
/* reason */,
532 Status
/* bg_error */,
533 bool* /* auto_recovery */) {}
535 // A callback function for RocksDB which will be called once the database
536 // is recovered from read-only mode after an error. When this is called, it
537 // means normal writes to the database can be issued and the user can
538 // initiate any further recovery actions needed
539 virtual void OnErrorRecoveryCompleted(Status
/* old_bg_error */) {}
541 virtual ~EventListener() {}
// Empty EventListener definition with no callbacks.
// NOTE(review): this repeats the name of the full class declared earlier,
// so it presumably sits in the other branch of the ROCKSDB_LITE conditional
// that is closed just below — the #if/#else lines are not visible; confirm.
class EventListener {
};
// Empty FlushJobInfo definition with no fields.
// NOTE(review): this repeats the name of the full struct declared earlier,
// so it presumably sits in the other branch of the ROCKSDB_LITE conditional
// that is closed just below — the #if/#else lines are not visible; confirm.
struct FlushJobInfo {
};
549 #endif // ROCKSDB_LITE
551 } // namespace ROCKSDB_NAMESPACE