]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/listener.h
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / include / rocksdb / listener.h
1 // Copyright (c) 2014 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5 #pragma once
6
#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
15
16 namespace rocksdb {
17
18 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
19 TablePropertiesCollection;
20
// Forward declarations for types that this header only names (by pointer,
// reference or as an opaque enum) before their full definitions are needed.
class DB;
class ColumnFamilyHandle;
class Status;
struct CompactionJobStats;
// Opaque enum declaration; the enumerators are defined elsewhere.
enum CompressionType : unsigned char;
26
// Why a table file was created. Reported through
// TableFileCreationBriefInfo::reason.
enum class TableFileCreationReason {
  kFlush = 0,
  kCompaction = 1,
  kRecovery = 2,
  kMisc = 3,
};
33
34 struct TableFileCreationBriefInfo {
35 // the name of the database where the file was created
36 std::string db_name;
37 // the name of the column family where the file was created.
38 std::string cf_name;
39 // the path to the created file.
40 std::string file_path;
41 // the id of the job (which could be flush or compaction) that
42 // created the file.
43 int job_id;
44 // reason of creating the table.
45 TableFileCreationReason reason;
46 };
47
48 struct TableFileCreationInfo : public TableFileCreationBriefInfo {
49 TableFileCreationInfo() = default;
50 explicit TableFileCreationInfo(TableProperties&& prop)
51 : table_properties(prop) {}
52 // the size of the file.
53 uint64_t file_size;
54 // Detailed properties of the created file.
55 TableProperties table_properties;
56 // The status indicating whether the creation was successful or not.
57 Status status;
58 };
59
// Why a compaction was scheduled. A bracketed prefix names the compaction
// style the reason applies to.
enum class CompactionReason : int {
  kUnknown = 0,

  // [Level] number of L0 files > level0_file_num_compaction_trigger
  kLevelL0FilesNum,
  // [Level] total size of level > MaxBytesForLevel()
  kLevelMaxLevelSize,

  // [Universal] Compacting for size amplification
  kUniversalSizeAmplification,
  // [Universal] Compacting for size ratio
  kUniversalSizeRatio,
  // [Universal] number of sorted runs > level0_file_num_compaction_trigger
  kUniversalSortedRunNum,

  // [FIFO] total size > max_table_files_size
  kFIFOMaxSize,
  // [FIFO] reduce number of files.
  kFIFOReduceNumFiles,
  // [FIFO] files with creation time < (current_time - interval)
  kFIFOTtl,

  // Manual compaction
  kManualCompaction,
  // DB::SuggestCompactRange() marked files for compaction
  kFilesMarkedForCompaction,
  // [Level] Automatic compaction within the bottommost level to clean up
  // duplicate versions of the same user key, usually due to a released
  // snapshot.
  kBottommostFiles,
  // Compaction based on TTL
  kTtl,
  // According to the comments in flush_job.cc, RocksDB treats flush as
  // a level 0 compaction in internal stats.
  kFlush,
  // Compaction caused by external sst file ingestion
  kExternalSstIngestion,

  // Total number of compaction reasons; new reasons must be added above
  // this entry.
  kNumOfReasons,
};
95
// Why a flush was triggered. Reported through FlushJobInfo::flush_reason.
// The numeric values are spelled out explicitly.
enum class FlushReason : int {
  kOthers = 0x00,
  kGetLiveFiles = 0x01,
  kShutDown = 0x02,
  kExternalFileIngestion = 0x03,
  kManualCompaction = 0x04,
  kWriteBufferManager = 0x05,
  kWriteBufferFull = 0x06,
  kTest = 0x07,
  kDeleteFiles = 0x08,
  kAutoCompaction = 0x09,
  kManualFlush = 0x0a,
  kErrorRecovery = 0x0b,
};
110
// Identifies the kind of operation that reported a background error. Passed
// to EventListener::OnBackgroundError() and
// EventListener::OnErrorRecoveryBegin().
enum class BackgroundErrorReason {
  kFlush = 0,
  kCompaction = 1,
  kWriteCallback = 2,
  kMemTable = 3,
};
117
// State of the write controller, as reported in WriteStallInfo.
enum class WriteStallCondition {
  kNormal = 0,
  kDelayed = 1,
  kStopped = 2,
};
123
124 struct WriteStallInfo {
125 // the name of the column family
126 std::string cf_name;
127 // state of the write controller
128 struct {
129 WriteStallCondition cur;
130 WriteStallCondition prev;
131 } condition;
132 };
133
134 #ifndef ROCKSDB_LITE
135
136 struct TableFileDeletionInfo {
137 // The name of the database where the file was deleted.
138 std::string db_name;
139 // The path to the deleted file.
140 std::string file_path;
141 // The id of the job which deleted the file.
142 int job_id;
143 // The status indicating whether the deletion was successful or not.
144 Status status;
145 };
146
147 struct FileOperationInfo {
148 using TimePoint = std::chrono::time_point<std::chrono::system_clock,
149 std::chrono::nanoseconds>;
150
151 const std::string& path;
152 uint64_t offset;
153 size_t length;
154 const TimePoint& start_timestamp;
155 const TimePoint& finish_timestamp;
156 Status status;
157 FileOperationInfo(const std::string& _path, const TimePoint& start,
158 const TimePoint& finish)
159 : path(_path), start_timestamp(start), finish_timestamp(finish) {}
160 };
161
162 struct FlushJobInfo {
163 // the id of the column family
164 uint32_t cf_id;
165 // the name of the column family
166 std::string cf_name;
167 // the path to the newly created file
168 std::string file_path;
169 // the id of the thread that completed this flush job.
170 uint64_t thread_id;
171 // the job id, which is unique in the same thread.
172 int job_id;
173 // If true, then rocksdb is currently slowing-down all writes to prevent
174 // creating too many Level 0 files as compaction seems not able to
175 // catch up the write request speed. This indicates that there are
176 // too many files in Level 0.
177 bool triggered_writes_slowdown;
178 // If true, then rocksdb is currently blocking any writes to prevent
179 // creating more L0 files. This indicates that there are too many
180 // files in level 0. Compactions should try to compact L0 files down
181 // to lower levels as soon as possible.
182 bool triggered_writes_stop;
183 // The smallest sequence number in the newly created file
184 SequenceNumber smallest_seqno;
185 // The largest sequence number in the newly created file
186 SequenceNumber largest_seqno;
187 // Table properties of the table being flushed
188 TableProperties table_properties;
189
190 FlushReason flush_reason;
191 };
192
193 struct CompactionJobInfo {
194 CompactionJobInfo() = default;
195 explicit CompactionJobInfo(const CompactionJobStats& _stats)
196 : stats(_stats) {}
197
198 // the id of the column family where the compaction happened.
199 uint32_t cf_id;
200 // the name of the column family where the compaction happened.
201 std::string cf_name;
202 // the status indicating whether the compaction was successful or not.
203 Status status;
204 // the id of the thread that completed this compaction job.
205 uint64_t thread_id;
206 // the job id, which is unique in the same thread.
207 int job_id;
208 // the smallest input level of the compaction.
209 int base_input_level;
210 // the output level of the compaction.
211 int output_level;
212 // the names of the compaction input files.
213 std::vector<std::string> input_files;
214
215 // the names of the compaction output files.
216 std::vector<std::string> output_files;
217 // Table properties for input and output tables.
218 // The map is keyed by values from input_files and output_files.
219 TablePropertiesCollection table_properties;
220
221 // Reason to run the compaction
222 CompactionReason compaction_reason;
223
224 // Compression algorithm used for output files
225 CompressionType compression;
226
227 // If non-null, this variable stores detailed information
228 // about this compaction.
229 CompactionJobStats stats;
230 };
231
232 struct MemTableInfo {
233 // the name of the column family to which memtable belongs
234 std::string cf_name;
235 // Sequence number of the first element that was inserted
236 // into the memtable.
237 SequenceNumber first_seqno;
238 // Sequence number that is guaranteed to be smaller than or equal
239 // to the sequence number of any key that could be inserted into this
240 // memtable. It can then be assumed that any write with a larger(or equal)
241 // sequence number will be present in this memtable or a later memtable.
242 SequenceNumber earliest_seqno;
243 // Total number of entries in memtable
244 uint64_t num_entries;
245 // Total number of deletes in memtable
246 uint64_t num_deletes;
247 };
248
249 struct ExternalFileIngestionInfo {
250 // the name of the column family
251 std::string cf_name;
252 // Path of the file outside the DB
253 std::string external_file_path;
254 // Path of the file inside the DB
255 std::string internal_file_path;
256 // The global sequence number assigned to keys in this file
257 SequenceNumber global_seqno;
258 // Table properties of the table being flushed
259 TableProperties table_properties;
260 };
261
262 // EventListener class contains a set of callback functions that will
263 // be called when specific RocksDB event happens such as flush. It can
264 // be used as a building block for developing custom features such as
265 // stats-collector or external compaction algorithm.
266 //
267 // Note that callback functions should not run for an extended period of
268 // time before the function returns, otherwise RocksDB may be blocked.
269 // For example, it is not suggested to do DB::CompactFiles() (as it may
270 // run for a long while) or issue many of DB::Put() (as Put may be blocked
271 // in certain cases) in the same thread in the EventListener callback.
272 // However, doing DB::CompactFiles() and DB::Put() in another thread is
273 // considered safe.
274 //
275 // [Threading] All EventListener callback will be called using the
276 // actual thread that involves in that specific event. For example, it
277 // is the RocksDB background flush thread that does the actual flush to
278 // call EventListener::OnFlushCompleted().
279 //
280 // [Locking] All EventListener callbacks are designed to be called without
281 // the current thread holding any DB mutex. This is to prevent potential
282 // deadlock and performance issue when using EventListener callback
283 // in a complex way.
284 class EventListener {
285 public:
286 // A callback function to RocksDB which will be called whenever a
287 // registered RocksDB flushes a file. The default implementation is
288 // no-op.
289 //
290 // Note that the this function must be implemented in a way such that
291 // it should not run for an extended period of time before the function
292 // returns. Otherwise, RocksDB may be blocked.
293 virtual void OnFlushCompleted(DB* /*db*/,
294 const FlushJobInfo& /*flush_job_info*/) {}
295
296 // A callback function to RocksDB which will be called before a
297 // RocksDB starts to flush memtables. The default implementation is
298 // no-op.
299 //
300 // Note that the this function must be implemented in a way such that
301 // it should not run for an extended period of time before the function
302 // returns. Otherwise, RocksDB may be blocked.
303 virtual void OnFlushBegin(DB* /*db*/,
304 const FlushJobInfo& /*flush_job_info*/) {}
305
306 // A callback function for RocksDB which will be called whenever
307 // a SST file is deleted. Different from OnCompactionCompleted and
308 // OnFlushCompleted, this callback is designed for external logging
309 // service and thus only provide string parameters instead
310 // of a pointer to DB. Applications that build logic basic based
311 // on file creations and deletions is suggested to implement
312 // OnFlushCompleted and OnCompactionCompleted.
313 //
314 // Note that if applications would like to use the passed reference
315 // outside this function call, they should make copies from the
316 // returned value.
317 virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
318
319 // A callback function to RocksDB which will be called before a
320 // RocksDB starts to compact. The default implementation is
321 // no-op.
322 //
323 // Note that the this function must be implemented in a way such that
324 // it should not run for an extended period of time before the function
325 // returns. Otherwise, RocksDB may be blocked.
326 virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
327
328 // A callback function for RocksDB which will be called whenever
329 // a registered RocksDB compacts a file. The default implementation
330 // is a no-op.
331 //
332 // Note that this function must be implemented in a way such that
333 // it should not run for an extended period of time before the function
334 // returns. Otherwise, RocksDB may be blocked.
335 //
336 // @param db a pointer to the rocksdb instance which just compacted
337 // a file.
338 // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
339 // after this function is returned, and must be copied if it is needed
340 // outside of this function.
341 virtual void OnCompactionCompleted(DB* /*db*/,
342 const CompactionJobInfo& /*ci*/) {}
343
344 // A callback function for RocksDB which will be called whenever
345 // a SST file is created. Different from OnCompactionCompleted and
346 // OnFlushCompleted, this callback is designed for external logging
347 // service and thus only provide string parameters instead
348 // of a pointer to DB. Applications that build logic basic based
349 // on file creations and deletions is suggested to implement
350 // OnFlushCompleted and OnCompactionCompleted.
351 //
352 // Historically it will only be called if the file is successfully created.
353 // Now it will also be called on failure case. User can check info.status
354 // to see if it succeeded or not.
355 //
356 // Note that if applications would like to use the passed reference
357 // outside this function call, they should make copies from these
358 // returned value.
359 virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
360
361 // A callback function for RocksDB which will be called before
362 // a SST file is being created. It will follow by OnTableFileCreated after
363 // the creation finishes.
364 //
365 // Note that if applications would like to use the passed reference
366 // outside this function call, they should make copies from these
367 // returned value.
368 virtual void OnTableFileCreationStarted(
369 const TableFileCreationBriefInfo& /*info*/) {}
370
371 // A callback function for RocksDB which will be called before
372 // a memtable is made immutable.
373 //
374 // Note that the this function must be implemented in a way such that
375 // it should not run for an extended period of time before the function
376 // returns. Otherwise, RocksDB may be blocked.
377 //
378 // Note that if applications would like to use the passed reference
379 // outside this function call, they should make copies from these
380 // returned value.
381 virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
382
383 // A callback function for RocksDB which will be called before
384 // a column family handle is deleted.
385 //
386 // Note that the this function must be implemented in a way such that
387 // it should not run for an extended period of time before the function
388 // returns. Otherwise, RocksDB may be blocked.
389 // @param handle is a pointer to the column family handle to be deleted
390 // which will become a dangling pointer after the deletion.
391 virtual void OnColumnFamilyHandleDeletionStarted(
392 ColumnFamilyHandle* /*handle*/) {}
393
394 // A callback function for RocksDB which will be called after an external
395 // file is ingested using IngestExternalFile.
396 //
397 // Note that the this function will run on the same thread as
398 // IngestExternalFile(), if this function is blocked, IngestExternalFile()
399 // will be blocked from finishing.
400 virtual void OnExternalFileIngested(
401 DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
402
403 // A callback function for RocksDB which will be called before setting the
404 // background error status to a non-OK value. The new background error status
405 // is provided in `bg_error` and can be modified by the callback. E.g., a
406 // callback can suppress errors by resetting it to Status::OK(), thus
407 // preventing the database from entering read-only mode. We do not provide any
408 // guarantee when failed flushes/compactions will be rescheduled if the user
409 // suppresses an error.
410 //
411 // Note that this function can run on the same threads as flush, compaction,
412 // and user writes. So, it is extremely important not to perform heavy
413 // computations or blocking calls in this function.
414 virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
415 Status* /* bg_error */) {}
416
417 // A callback function for RocksDB which will be called whenever a change
418 // of superversion triggers a change of the stall conditions.
419 //
420 // Note that the this function must be implemented in a way such that
421 // it should not run for an extended period of time before the function
422 // returns. Otherwise, RocksDB may be blocked.
423 virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
424
425 // A callback function for RocksDB which will be called whenever a file read
426 // operation finishes.
427 virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
428
429 // A callback function for RocksDB which will be called whenever a file write
430 // operation finishes.
431 virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
432
433 // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If
434 // false, then they won't be called.
435 virtual bool ShouldBeNotifiedOnFileIO() { return false; }
436
437 // A callback function for RocksDB which will be called just before
438 // starting the automatic recovery process for recoverable background
439 // errors, such as NoSpace(). The callback can suppress the automatic
440 // recovery by setting *auto_recovery to false. The database will then
441 // have to be transitioned out of read-only mode by calling DB::Resume()
442 virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
443 Status /* bg_error */,
444 bool* /* auto_recovery */) {}
445
446 // A callback function for RocksDB which will be called once the database
447 // is recovered from read-only mode after an error. When this is called, it
448 // means normal writes to the database can be issued and the user can
449 // initiate any further recovery actions needed
450 virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
451
452 virtual ~EventListener() {}
453 };
454
455 #else
456
// ROCKSDB_LITE build: EventListener is an empty placeholder type with no
// callbacks.
class EventListener {
};
458
459 #endif // ROCKSDB_LITE
460
461 } // namespace rocksdb