]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/listener.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / include / rocksdb / listener.h
1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4 //
5 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
6
7 #pragma once
8
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/compression_type.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
19
20 namespace ROCKSDB_NAMESPACE {
21
22 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
23 TablePropertiesCollection;
24
25 class DB;
26 class ColumnFamilyHandle;
27 class Status;
28 struct CompactionJobStats;
29
// The reason a table file was created (see
// TableFileCreationBriefInfo::reason).
enum class TableFileCreationReason {
  kFlush = 0,
  kCompaction,
  kRecovery,
  kMisc,
};
36
37 struct TableFileCreationBriefInfo {
38 // the name of the database where the file was created
39 std::string db_name;
40 // the name of the column family where the file was created.
41 std::string cf_name;
42 // the path to the created file.
43 std::string file_path;
44 // the id of the job (which could be flush or compaction) that
45 // created the file.
46 int job_id;
47 // reason of creating the table.
48 TableFileCreationReason reason;
49 };
50
51 struct TableFileCreationInfo : public TableFileCreationBriefInfo {
52 TableFileCreationInfo() = default;
53 explicit TableFileCreationInfo(TableProperties&& prop)
54 : table_properties(prop) {}
55 // the size of the file.
56 uint64_t file_size;
57 // Detailed properties of the created file.
58 TableProperties table_properties;
59 // The status indicating whether the creation was successful or not.
60 Status status;
61 // The checksum of the table file being created
62 std::string file_checksum;
63 // The checksum function name of checksum generator used for this table file
64 std::string file_checksum_func_name;
65 };
66
// Why a compaction was run. A bracketed prefix names the compaction style
// the reason applies to.
enum class CompactionReason : int {
  kUnknown = 0,
  // [Level] The number of L0 files exceeds
  // level0_file_num_compaction_trigger.
  kLevelL0FilesNum,
  // [Level] The total size of a level exceeds MaxBytesForLevel().
  kLevelMaxLevelSize,
  // [Universal] Compaction to reduce size amplification.
  kUniversalSizeAmplification,
  // [Universal] Compaction driven by the size ratio.
  kUniversalSizeRatio,
  // [Universal] The number of sorted runs exceeds
  // level0_file_num_compaction_trigger.
  kUniversalSortedRunNum,
  // [FIFO] The total size exceeds max_table_files_size.
  kFIFOMaxSize,
  // [FIFO] Compaction to reduce the number of files.
  kFIFOReduceNumFiles,
  // [FIFO] Files whose creation time is older than
  // (current_time - interval).
  kFIFOTtl,
  // A manually requested compaction.
  kManualCompaction,
  // Files were marked for compaction by DB::SuggestCompactRange().
  kFilesMarkedForCompaction,
  // [Level] Automatic compaction inside the bottommost level that cleans up
  // duplicate versions of the same user key, usually after a snapshot was
  // released.
  kBottommostFiles,
  // TTL-based compaction.
  kTtl,
  // Per the comments in flush_job.cc, internal stats account for a flush as
  // a level 0 compaction.
  kFlush,
  // Compaction triggered by the ingestion of an external SST file.
  kExternalSstIngestion,
  // An SST file became too old.
  kPeriodicCompaction,
  // Number of compaction reasons. Keep this last: new reasons go above it.
  kNumOfReasons,
};
104
// Why a flush was triggered. Every enumerator carries an explicit value.
enum class FlushReason : int {
  kOthers = 0x00,
  kGetLiveFiles = 0x01,
  kShutDown = 0x02,
  kExternalFileIngestion = 0x03,
  kManualCompaction = 0x04,
  kWriteBufferManager = 0x05,
  kWriteBufferFull = 0x06,
  kTest = 0x07,
  kDeleteFiles = 0x08,
  kAutoCompaction = 0x09,
  kManualFlush = 0x0a,
  kErrorRecovery = 0x0b,
  // With this reason SwitchMemtable is not called, which avoids creating
  // many small immutable memtables.
  kErrorRecoveryRetryFlush = 0x0c,
};
122
// The reason reported with a background error (see
// EventListener::OnBackgroundError() and
// EventListener::OnErrorRecoveryBegin()).
enum class BackgroundErrorReason {
  kFlush = 0,
  kCompaction,
  kWriteCallback,
  kMemTable,
  kManifestWrite,
  kFlushNoWAL,
};
131
// The write stall conditions reported through WriteStallInfo.
enum class WriteStallCondition {
  kNormal = 0,
  kDelayed,
  kStopped,
};
137
138 struct WriteStallInfo {
139 // the name of the column family
140 std::string cf_name;
141 // state of the write controller
142 struct {
143 WriteStallCondition cur;
144 WriteStallCondition prev;
145 } condition;
146 };
147
148 #ifndef ROCKSDB_LITE
149
150 struct TableFileDeletionInfo {
151 // The name of the database where the file was deleted.
152 std::string db_name;
153 // The path to the deleted file.
154 std::string file_path;
155 // The id of the job which deleted the file.
156 int job_id;
157 // The status indicating whether the deletion was successful or not.
158 Status status;
159 };
160
// The kind of file operation described by a FileOperationInfo.
enum class FileOperationType {
  kRead = 0,
  kWrite,
  kTruncate,
  kClose,
  kFlush,
  kSync,
  kFsync,
  kRangeSync,
};
171
172 struct FileOperationInfo {
173 using Duration = std::chrono::nanoseconds;
174 using SteadyTimePoint =
175 std::chrono::time_point<std::chrono::steady_clock, Duration>;
176 using SystemTimePoint =
177 std::chrono::time_point<std::chrono::system_clock, Duration>;
178 using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
179 using FinishTimePoint = SteadyTimePoint;
180
181 FileOperationType type;
182 const std::string& path;
183 uint64_t offset;
184 size_t length;
185 const Duration duration;
186 const SystemTimePoint& start_ts;
187 Status status;
188 FileOperationInfo(const FileOperationType _type, const std::string& _path,
189 const StartTimePoint& _start_ts,
190 const FinishTimePoint& _finish_ts, const Status& _status)
191 : type(_type),
192 path(_path),
193 duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
194 _finish_ts - _start_ts.second)),
195 start_ts(_start_ts.first),
196 status(_status) {}
197 static StartTimePoint StartNow() {
198 return std::make_pair<SystemTimePoint, SteadyTimePoint>(
199 std::chrono::system_clock::now(), std::chrono::steady_clock::now());
200 }
201 static FinishTimePoint FinishNow() {
202 return std::chrono::steady_clock::now();
203 }
204 };
205
206 struct FlushJobInfo {
207 // the id of the column family
208 uint32_t cf_id;
209 // the name of the column family
210 std::string cf_name;
211 // the path to the newly created file
212 std::string file_path;
213 // the file number of the newly created file
214 uint64_t file_number;
215 // the oldest blob file referenced by the newly created file
216 uint64_t oldest_blob_file_number;
217 // the id of the thread that completed this flush job.
218 uint64_t thread_id;
219 // the job id, which is unique in the same thread.
220 int job_id;
221 // If true, then rocksdb is currently slowing-down all writes to prevent
222 // creating too many Level 0 files as compaction seems not able to
223 // catch up the write request speed. This indicates that there are
224 // too many files in Level 0.
225 bool triggered_writes_slowdown;
226 // If true, then rocksdb is currently blocking any writes to prevent
227 // creating more L0 files. This indicates that there are too many
228 // files in level 0. Compactions should try to compact L0 files down
229 // to lower levels as soon as possible.
230 bool triggered_writes_stop;
231 // The smallest sequence number in the newly created file
232 SequenceNumber smallest_seqno;
233 // The largest sequence number in the newly created file
234 SequenceNumber largest_seqno;
235 // Table properties of the table being flushed
236 TableProperties table_properties;
237
238 FlushReason flush_reason;
239 };
240
// Per-file details for a compaction input or output file (see
// CompactionJobInfo::input_file_infos and
// CompactionJobInfo::output_file_infos).
struct CompactionFileInfo {
  // Level the file is in.
  int level;

  // File number of the file.
  uint64_t file_number;

  // File number of the oldest blob file referenced by this SST file.
  uint64_t oldest_blob_file_number;
};
251
252 struct CompactionJobInfo {
253 ~CompactionJobInfo() { status.PermitUncheckedError(); }
254 // the id of the column family where the compaction happened.
255 uint32_t cf_id;
256 // the name of the column family where the compaction happened.
257 std::string cf_name;
258 // the status indicating whether the compaction was successful or not.
259 Status status;
260 // the id of the thread that completed this compaction job.
261 uint64_t thread_id;
262 // the job id, which is unique in the same thread.
263 int job_id;
264 // the smallest input level of the compaction.
265 int base_input_level;
266 // the output level of the compaction.
267 int output_level;
268
269 // The following variables contain information about compaction inputs
270 // and outputs. A file may appear in both the input and output lists
271 // if it was simply moved to a different level. The order of elements
272 // is the same across input_files and input_file_infos; similarly, it is
273 // the same across output_files and output_file_infos.
274
275 // The names of the compaction input files.
276 std::vector<std::string> input_files;
277
278 // Additional information about the compaction input files.
279 std::vector<CompactionFileInfo> input_file_infos;
280
281 // The names of the compaction output files.
282 std::vector<std::string> output_files;
283
284 // Additional information about the compaction output files.
285 std::vector<CompactionFileInfo> output_file_infos;
286
287 // Table properties for input and output tables.
288 // The map is keyed by values from input_files and output_files.
289 TablePropertiesCollection table_properties;
290
291 // Reason to run the compaction
292 CompactionReason compaction_reason;
293
294 // Compression algorithm used for output files
295 CompressionType compression;
296
297 // Statistics and other additional details on the compaction
298 CompactionJobStats stats;
299 };
300
301 struct MemTableInfo {
302 // the name of the column family to which memtable belongs
303 std::string cf_name;
304 // Sequence number of the first element that was inserted
305 // into the memtable.
306 SequenceNumber first_seqno;
307 // Sequence number that is guaranteed to be smaller than or equal
308 // to the sequence number of any key that could be inserted into this
309 // memtable. It can then be assumed that any write with a larger(or equal)
310 // sequence number will be present in this memtable or a later memtable.
311 SequenceNumber earliest_seqno;
312 // Total number of entries in memtable
313 uint64_t num_entries;
314 // Total number of deletes in memtable
315 uint64_t num_deletes;
316 };
317
318 struct ExternalFileIngestionInfo {
319 // the name of the column family
320 std::string cf_name;
321 // Path of the file outside the DB
322 std::string external_file_path;
323 // Path of the file inside the DB
324 std::string internal_file_path;
325 // The global sequence number assigned to keys in this file
326 SequenceNumber global_seqno;
327 // Table properties of the table being flushed
328 TableProperties table_properties;
329 };
330
331 // EventListener class contains a set of callback functions that will
332 // be called when specific RocksDB event happens such as flush. It can
333 // be used as a building block for developing custom features such as
334 // stats-collector or external compaction algorithm.
335 //
336 // Note that callback functions should not run for an extended period of
337 // time before the function returns, otherwise RocksDB may be blocked.
338 // For example, it is not suggested to do DB::CompactFiles() (as it may
339 // run for a long while) or issue many of DB::Put() (as Put may be blocked
340 // in certain cases) in the same thread in the EventListener callback.
341 // However, doing DB::CompactFiles() and DB::Put() in another thread is
342 // considered safe.
343 //
344 // [Threading] All EventListener callback will be called using the
345 // actual thread that involves in that specific event. For example, it
346 // is the RocksDB background flush thread that does the actual flush to
347 // call EventListener::OnFlushCompleted().
348 //
349 // [Locking] All EventListener callbacks are designed to be called without
350 // the current thread holding any DB mutex. This is to prevent potential
351 // deadlock and performance issue when using EventListener callback
352 // in a complex way.
353 class EventListener {
354 public:
355 // A callback function to RocksDB which will be called whenever a
356 // registered RocksDB flushes a file. The default implementation is
357 // no-op.
358 //
359 // Note that the this function must be implemented in a way such that
360 // it should not run for an extended period of time before the function
361 // returns. Otherwise, RocksDB may be blocked.
362 virtual void OnFlushCompleted(DB* /*db*/,
363 const FlushJobInfo& /*flush_job_info*/) {}
364
365 // A callback function to RocksDB which will be called before a
366 // RocksDB starts to flush memtables. The default implementation is
367 // no-op.
368 //
369 // Note that the this function must be implemented in a way such that
370 // it should not run for an extended period of time before the function
371 // returns. Otherwise, RocksDB may be blocked.
372 virtual void OnFlushBegin(DB* /*db*/,
373 const FlushJobInfo& /*flush_job_info*/) {}
374
375 // A callback function for RocksDB which will be called whenever
376 // a SST file is deleted. Different from OnCompactionCompleted and
377 // OnFlushCompleted, this callback is designed for external logging
378 // service and thus only provide string parameters instead
379 // of a pointer to DB. Applications that build logic basic based
380 // on file creations and deletions is suggested to implement
381 // OnFlushCompleted and OnCompactionCompleted.
382 //
383 // Note that if applications would like to use the passed reference
384 // outside this function call, they should make copies from the
385 // returned value.
386 virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
387
388 // A callback function to RocksDB which will be called before a
389 // RocksDB starts to compact. The default implementation is
390 // no-op.
391 //
392 // Note that the this function must be implemented in a way such that
393 // it should not run for an extended period of time before the function
394 // returns. Otherwise, RocksDB may be blocked.
395 virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
396
397 // A callback function for RocksDB which will be called whenever
398 // a registered RocksDB compacts a file. The default implementation
399 // is a no-op.
400 //
401 // Note that this function must be implemented in a way such that
402 // it should not run for an extended period of time before the function
403 // returns. Otherwise, RocksDB may be blocked.
404 //
405 // @param db a pointer to the rocksdb instance which just compacted
406 // a file.
407 // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
408 // after this function is returned, and must be copied if it is needed
409 // outside of this function.
410 virtual void OnCompactionCompleted(DB* /*db*/,
411 const CompactionJobInfo& /*ci*/) {}
412
413 // A callback function for RocksDB which will be called whenever
414 // a SST file is created. Different from OnCompactionCompleted and
415 // OnFlushCompleted, this callback is designed for external logging
416 // service and thus only provide string parameters instead
417 // of a pointer to DB. Applications that build logic basic based
418 // on file creations and deletions is suggested to implement
419 // OnFlushCompleted and OnCompactionCompleted.
420 //
421 // Historically it will only be called if the file is successfully created.
422 // Now it will also be called on failure case. User can check info.status
423 // to see if it succeeded or not.
424 //
425 // Note that if applications would like to use the passed reference
426 // outside this function call, they should make copies from these
427 // returned value.
428 virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
429
430 // A callback function for RocksDB which will be called before
431 // a SST file is being created. It will follow by OnTableFileCreated after
432 // the creation finishes.
433 //
434 // Note that if applications would like to use the passed reference
435 // outside this function call, they should make copies from these
436 // returned value.
437 virtual void OnTableFileCreationStarted(
438 const TableFileCreationBriefInfo& /*info*/) {}
439
440 // A callback function for RocksDB which will be called before
441 // a memtable is made immutable.
442 //
443 // Note that the this function must be implemented in a way such that
444 // it should not run for an extended period of time before the function
445 // returns. Otherwise, RocksDB may be blocked.
446 //
447 // Note that if applications would like to use the passed reference
448 // outside this function call, they should make copies from these
449 // returned value.
450 virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
451
452 // A callback function for RocksDB which will be called before
453 // a column family handle is deleted.
454 //
455 // Note that the this function must be implemented in a way such that
456 // it should not run for an extended period of time before the function
457 // returns. Otherwise, RocksDB may be blocked.
458 // @param handle is a pointer to the column family handle to be deleted
459 // which will become a dangling pointer after the deletion.
460 virtual void OnColumnFamilyHandleDeletionStarted(
461 ColumnFamilyHandle* /*handle*/) {}
462
463 // A callback function for RocksDB which will be called after an external
464 // file is ingested using IngestExternalFile.
465 //
466 // Note that the this function will run on the same thread as
467 // IngestExternalFile(), if this function is blocked, IngestExternalFile()
468 // will be blocked from finishing.
469 virtual void OnExternalFileIngested(
470 DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
471
472 // A callback function for RocksDB which will be called before setting the
473 // background error status to a non-OK value. The new background error status
474 // is provided in `bg_error` and can be modified by the callback. E.g., a
475 // callback can suppress errors by resetting it to Status::OK(), thus
476 // preventing the database from entering read-only mode. We do not provide any
477 // guarantee when failed flushes/compactions will be rescheduled if the user
478 // suppresses an error.
479 //
480 // Note that this function can run on the same threads as flush, compaction,
481 // and user writes. So, it is extremely important not to perform heavy
482 // computations or blocking calls in this function.
483 virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
484 Status* /* bg_error */) {}
485
486 // A callback function for RocksDB which will be called whenever a change
487 // of superversion triggers a change of the stall conditions.
488 //
489 // Note that the this function must be implemented in a way such that
490 // it should not run for an extended period of time before the function
491 // returns. Otherwise, RocksDB may be blocked.
492 virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
493
494 // A callback function for RocksDB which will be called whenever a file read
495 // operation finishes.
496 virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
497
498 // A callback function for RocksDB which will be called whenever a file write
499 // operation finishes.
500 virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
501
502 // A callback function for RocksDB which will be called whenever a file flush
503 // operation finishes.
504 virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
505
506 // A callback function for RocksDB which will be called whenever a file sync
507 // operation finishes.
508 virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
509
510 // A callback function for RocksDB which will be called whenever a file
511 // rangeSync operation finishes.
512 virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
513
514 // A callback function for RocksDB which will be called whenever a file
515 // truncate operation finishes.
516 virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
517
518 // A callback function for RocksDB which will be called whenever a file close
519 // operation finishes.
520 virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
521
522 // If true, the OnFile*Finish functions will be called. If
523 // false, then they won't be called.
524 virtual bool ShouldBeNotifiedOnFileIO() { return false; }
525
526 // A callback function for RocksDB which will be called just before
527 // starting the automatic recovery process for recoverable background
528 // errors, such as NoSpace(). The callback can suppress the automatic
529 // recovery by setting *auto_recovery to false. The database will then
530 // have to be transitioned out of read-only mode by calling DB::Resume()
531 virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
532 Status /* bg_error */,
533 bool* /* auto_recovery */) {}
534
535 // A callback function for RocksDB which will be called once the database
536 // is recovered from read-only mode after an error. When this is called, it
537 // means normal writes to the database can be issued and the user can
538 // initiate any further recovery actions needed
539 virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
540
541 virtual ~EventListener() {}
542 };
543
544 #else
545
// With ROCKSDB_LITE defined, only empty placeholder definitions of these
// two types are provided.
class EventListener {};
struct FlushJobInfo {};
548
549 #endif // ROCKSDB_LITE
550
551 } // namespace ROCKSDB_NAMESPACE