]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/statistics.h
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / include / rocksdb / statistics.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/status.h"
17
18 namespace rocksdb {
19
20 /**
21 * Keep adding ticker's here.
22 * 1. Any ticker should be added before TICKER_ENUM_MAX.
23 * 2. Add a readable string in TickersNameMap below for the newly added ticker.
24 * 3. Add a corresponding enum value to TickerType.java in the java API
25 * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType
26 * and toCppTickers
27 */
28 enum Tickers : uint32_t {
29 // total block cache misses
30 // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
31 // BLOCK_CACHE_FILTER_MISS +
32 // BLOCK_CACHE_DATA_MISS;
33 BLOCK_CACHE_MISS = 0,
34 // total block cache hit
35 // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
36 // BLOCK_CACHE_FILTER_HIT +
37 // BLOCK_CACHE_DATA_HIT;
38 BLOCK_CACHE_HIT,
39 // # of blocks added to block cache.
40 BLOCK_CACHE_ADD,
41 // # of failures when adding blocks to block cache.
42 BLOCK_CACHE_ADD_FAILURES,
43 // # of times cache miss when accessing index block from block cache.
44 BLOCK_CACHE_INDEX_MISS,
45 // # of times cache hit when accessing index block from block cache.
46 BLOCK_CACHE_INDEX_HIT,
47 // # of index blocks added to block cache.
48 BLOCK_CACHE_INDEX_ADD,
49 // # of bytes of index blocks inserted into cache
50 BLOCK_CACHE_INDEX_BYTES_INSERT,
51 // # of bytes of index block erased from cache
52 BLOCK_CACHE_INDEX_BYTES_EVICT,
53 // # of times cache miss when accessing filter block from block cache.
54 BLOCK_CACHE_FILTER_MISS,
55 // # of times cache hit when accessing filter block from block cache.
56 BLOCK_CACHE_FILTER_HIT,
57 // # of filter blocks added to block cache.
58 BLOCK_CACHE_FILTER_ADD,
59 // # of bytes of bloom filter blocks inserted into cache
60 BLOCK_CACHE_FILTER_BYTES_INSERT,
61 // # of bytes of bloom filter block erased from cache
62 BLOCK_CACHE_FILTER_BYTES_EVICT,
63 // # of times cache miss when accessing data block from block cache.
64 BLOCK_CACHE_DATA_MISS,
65 // # of times cache hit when accessing data block from block cache.
66 BLOCK_CACHE_DATA_HIT,
67 // # of data blocks added to block cache.
68 BLOCK_CACHE_DATA_ADD,
69 // # of bytes of data blocks inserted into cache
70 BLOCK_CACHE_DATA_BYTES_INSERT,
71 // # of bytes read from cache.
72 BLOCK_CACHE_BYTES_READ,
73 // # of bytes written into cache.
74 BLOCK_CACHE_BYTES_WRITE,
75
76 // # of times bloom filter has avoided file reads, i.e., negatives.
77 BLOOM_FILTER_USEFUL,
78 // # of times bloom FullFilter has not avoided the reads.
79 BLOOM_FILTER_FULL_POSITIVE,
80 // # of times bloom FullFilter has not avoided the reads and data actually
81 // exist.
82 BLOOM_FILTER_FULL_TRUE_POSITIVE,
83
84 // # persistent cache hit
85 PERSISTENT_CACHE_HIT,
86 // # persistent cache miss
87 PERSISTENT_CACHE_MISS,
88
89 // # total simulation block cache hits
90 SIM_BLOCK_CACHE_HIT,
91 // # total simulation block cache misses
92 SIM_BLOCK_CACHE_MISS,
93
94 // # of memtable hits.
95 MEMTABLE_HIT,
96 // # of memtable misses.
97 MEMTABLE_MISS,
98
99 // # of Get() queries served by L0
100 GET_HIT_L0,
101 // # of Get() queries served by L1
102 GET_HIT_L1,
103 // # of Get() queries served by L2 and up
104 GET_HIT_L2_AND_UP,
105
106 /**
107 * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
108 * There are 4 reasons currently.
109 */
110 COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
111 // Also includes keys dropped for range del.
112 COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
113 COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone.
114 COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
115 COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted.
116 // Deletions obsoleted before bottom level due to file gap optimization.
117 COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
118 // If a compaction was cancelled in sfm to prevent ENOSPC
119 COMPACTION_CANCELLED,
120
121 // Number of keys written to the database via the Put and Write call's
122 NUMBER_KEYS_WRITTEN,
123 // Number of Keys read,
124 NUMBER_KEYS_READ,
125 // Number keys updated, if inplace update is enabled
126 NUMBER_KEYS_UPDATED,
127 // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
128 // DB::Merge(), and DB::Write().
129 BYTES_WRITTEN,
130 // The number of uncompressed bytes read from DB::Get(). It could be
131 // either from memtables, cache, or table files.
132 // For the number of logical bytes read from DB::MultiGet(),
133 // please use NUMBER_MULTIGET_BYTES_READ.
134 BYTES_READ,
135 // The number of calls to seek/next/prev
136 NUMBER_DB_SEEK,
137 NUMBER_DB_NEXT,
138 NUMBER_DB_PREV,
139 // The number of calls to seek/next/prev that returned data
140 NUMBER_DB_SEEK_FOUND,
141 NUMBER_DB_NEXT_FOUND,
142 NUMBER_DB_PREV_FOUND,
143 // The number of uncompressed bytes read from an iterator.
144 // Includes size of key and value.
145 ITER_BYTES_READ,
146 NO_FILE_CLOSES,
147 NO_FILE_OPENS,
148 NO_FILE_ERRORS,
149 // DEPRECATED Time system had to wait to do LO-L1 compactions
150 STALL_L0_SLOWDOWN_MICROS,
151 // DEPRECATED Time system had to wait to move memtable to L1.
152 STALL_MEMTABLE_COMPACTION_MICROS,
153 // DEPRECATED write throttle because of too many files in L0
154 STALL_L0_NUM_FILES_MICROS,
155 // Writer has to wait for compaction or flush to finish.
156 STALL_MICROS,
157 // The wait time for db mutex.
158 // Disabled by default. To enable it set stats level to kAll
159 DB_MUTEX_WAIT_MICROS,
160 RATE_LIMIT_DELAY_MILLIS,
161 // DEPRECATED number of iterators currently open
162 NO_ITERATORS,
163
164 // Number of MultiGet calls, keys read, and bytes read
165 NUMBER_MULTIGET_CALLS,
166 NUMBER_MULTIGET_KEYS_READ,
167 NUMBER_MULTIGET_BYTES_READ,
168
169 // Number of deletes records that were not required to be
170 // written to storage because key does not exist
171 NUMBER_FILTERED_DELETES,
172 NUMBER_MERGE_FAILURES,
173
174 // number of times bloom was checked before creating iterator on a
175 // file, and the number of times the check was useful in avoiding
176 // iterator creation (and thus likely IOPs).
177 BLOOM_FILTER_PREFIX_CHECKED,
178 BLOOM_FILTER_PREFIX_USEFUL,
179
180 // Number of times we had to reseek inside an iteration to skip
181 // over large number of keys with same userkey.
182 NUMBER_OF_RESEEKS_IN_ITERATION,
183
184 // Record the number of calls to GetUpadtesSince. Useful to keep track of
185 // transaction log iterator refreshes
186 GET_UPDATES_SINCE_CALLS,
187 BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
188 BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
189 // Number of blocks added to compressed block cache
190 BLOCK_CACHE_COMPRESSED_ADD,
191 // Number of failures when adding blocks to compressed block cache
192 BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
193 WAL_FILE_SYNCED, // Number of times WAL sync is done
194 WAL_FILE_BYTES, // Number of bytes written to WAL
195
196 // Writes can be processed by requesting thread or by the thread at the
197 // head of the writers queue.
198 WRITE_DONE_BY_SELF,
199 WRITE_DONE_BY_OTHER, // Equivalent to writes done for others
200 WRITE_TIMEDOUT, // Number of writes ending up with timed-out.
201 WRITE_WITH_WAL, // Number of Write calls that request WAL
202 COMPACT_READ_BYTES, // Bytes read during compaction
203 COMPACT_WRITE_BYTES, // Bytes written during compaction
204 FLUSH_WRITE_BYTES, // Bytes written during flush
205
206 // Number of table's properties loaded directly from file, without creating
207 // table reader object.
208 NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
209 NUMBER_SUPERVERSION_ACQUIRES,
210 NUMBER_SUPERVERSION_RELEASES,
211 NUMBER_SUPERVERSION_CLEANUPS,
212
213 // # of compressions/decompressions executed
214 NUMBER_BLOCK_COMPRESSED,
215 NUMBER_BLOCK_DECOMPRESSED,
216
217 NUMBER_BLOCK_NOT_COMPRESSED,
218 MERGE_OPERATION_TOTAL_TIME,
219 FILTER_OPERATION_TOTAL_TIME,
220
221 // Row cache.
222 ROW_CACHE_HIT,
223 ROW_CACHE_MISS,
224
225 // Read amplification statistics.
226 // Read amplification can be calculated using this formula
227 // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
228 //
229 // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
230 READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used.
231 READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks.
232
233 // Number of refill intervals where rate limiter's bytes are fully consumed.
234 NUMBER_RATE_LIMITER_DRAINS,
235
236 // Number of internal keys skipped by Iterator
237 NUMBER_ITER_SKIP,
238
239 // BlobDB specific stats
240 // # of Put/PutTTL/PutUntil to BlobDB.
241 BLOB_DB_NUM_PUT,
242 // # of Write to BlobDB.
243 BLOB_DB_NUM_WRITE,
244 // # of Get to BlobDB.
245 BLOB_DB_NUM_GET,
246 // # of MultiGet to BlobDB.
247 BLOB_DB_NUM_MULTIGET,
248 // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
249 BLOB_DB_NUM_SEEK,
250 // # of Next to BlobDB iterator.
251 BLOB_DB_NUM_NEXT,
252 // # of Prev to BlobDB iterator.
253 BLOB_DB_NUM_PREV,
254 // # of keys written to BlobDB.
255 BLOB_DB_NUM_KEYS_WRITTEN,
256 // # of keys read from BlobDB.
257 BLOB_DB_NUM_KEYS_READ,
258 // # of bytes (key + value) written to BlobDB.
259 BLOB_DB_BYTES_WRITTEN,
260 // # of bytes (keys + value) read from BlobDB.
261 BLOB_DB_BYTES_READ,
262 // # of keys written by BlobDB as non-TTL inlined value.
263 BLOB_DB_WRITE_INLINED,
264 // # of keys written by BlobDB as TTL inlined value.
265 BLOB_DB_WRITE_INLINED_TTL,
266 // # of keys written by BlobDB as non-TTL blob value.
267 BLOB_DB_WRITE_BLOB,
268 // # of keys written by BlobDB as TTL blob value.
269 BLOB_DB_WRITE_BLOB_TTL,
270 // # of bytes written to blob file.
271 BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
272 // # of bytes read from blob file.
273 BLOB_DB_BLOB_FILE_BYTES_READ,
274 // # of times a blob files being synced.
275 BLOB_DB_BLOB_FILE_SYNCED,
276 // # of blob index evicted from base DB by BlobDB compaction filter because
277 // of expiration.
278 BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
279 // size of blob index evicted from base DB by BlobDB compaction filter
280 // because of expiration.
281 BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
282 // # of blob index evicted from base DB by BlobDB compaction filter because
283 // of corresponding file deleted.
284 BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
285 // size of blob index evicted from base DB by BlobDB compaction filter
286 // because of corresponding file deleted.
287 BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
288 // # of blob files being garbage collected.
289 BLOB_DB_GC_NUM_FILES,
290 // # of blob files generated by garbage collection.
291 BLOB_DB_GC_NUM_NEW_FILES,
292 // # of BlobDB garbage collection failures.
293 BLOB_DB_GC_FAILURES,
294 // # of keys drop by BlobDB garbage collection because they had been
295 // overwritten.
296 BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
297 // # of keys drop by BlobDB garbage collection because of expiration.
298 BLOB_DB_GC_NUM_KEYS_EXPIRED,
299 // # of keys relocated to new blob file by garbage collection.
300 BLOB_DB_GC_NUM_KEYS_RELOCATED,
301 // # of bytes drop by BlobDB garbage collection because they had been
302 // overwritten.
303 BLOB_DB_GC_BYTES_OVERWRITTEN,
304 // # of bytes drop by BlobDB garbage collection because of expiration.
305 BLOB_DB_GC_BYTES_EXPIRED,
306 // # of bytes relocated to new blob file by garbage collection.
307 BLOB_DB_GC_BYTES_RELOCATED,
308 // # of blob files evicted because of BlobDB is full.
309 BLOB_DB_FIFO_NUM_FILES_EVICTED,
310 // # of keys in the blob files evicted because of BlobDB is full.
311 BLOB_DB_FIFO_NUM_KEYS_EVICTED,
312 // # of bytes in the blob files evicted because of BlobDB is full.
313 BLOB_DB_FIFO_BYTES_EVICTED,
314
315 // These counters indicate a performance issue in WritePrepared transactions.
316 // We should not seem them ticking them much.
317 // # of times prepare_mutex_ is acquired in the fast path.
318 TXN_PREPARE_MUTEX_OVERHEAD,
319 // # of times old_commit_map_mutex_ is acquired in the fast path.
320 TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
321 // # of times we checked a batch for duplicate keys.
322 TXN_DUPLICATE_KEY_OVERHEAD,
323 // # of times snapshot_mutex_ is acquired in the fast path.
324 TXN_SNAPSHOT_MUTEX_OVERHEAD,
325
326 // Number of keys actually found in MultiGet calls (vs number requested by
327 // caller)
328 // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
329 NUMBER_MULTIGET_KEYS_FOUND,
330
331 NO_ITERATOR_CREATED, // number of iterators created
332 NO_ITERATOR_DELETED, // number of iterators deleted
333
334 BLOCK_CACHE_COMPRESSION_DICT_MISS,
335 BLOCK_CACHE_COMPRESSION_DICT_HIT,
336 BLOCK_CACHE_COMPRESSION_DICT_ADD,
337 BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
338 BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
339 TICKER_ENUM_MAX
340 };
341
342 // The order of items listed in Tickers should be the same as
343 // the order listed in TickersNameMap
344 extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
345
/**
 * Keep adding histograms here.
 * Any histogram should have value less than HISTOGRAM_ENUM_MAX
 * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
 * Add a string representation in HistogramsNameMap below
 * And increment HISTOGRAM_ENUM_MAX
 * Add a corresponding enum value to HistogramType.java in the java API
 */
enum Histograms : uint32_t {
  DB_GET = 0,
  DB_WRITE,
  COMPACTION_TIME,
  COMPACTION_CPU_TIME,
  SUBCOMPACTION_SETUP_TIME,
  TABLE_SYNC_MICROS,
  COMPACTION_OUTFILE_SYNC_MICROS,
  WAL_FILE_SYNC_MICROS,
  MANIFEST_FILE_SYNC_MICROS,
  // TIME SPENT IN IO DURING TABLE OPEN
  TABLE_OPEN_IO_MICROS,
  DB_MULTIGET,
  READ_BLOCK_COMPACTION_MICROS,
  READ_BLOCK_GET_MICROS,
  WRITE_RAW_BLOCK_MICROS,
  STALL_L0_SLOWDOWN_COUNT,
  STALL_MEMTABLE_COMPACTION_COUNT,
  STALL_L0_NUM_FILES_COUNT,
  HARD_RATE_LIMIT_DELAY_COUNT,
  SOFT_RATE_LIMIT_DELAY_COUNT,
  NUM_FILES_IN_SINGLE_COMPACTION,
  DB_SEEK,
  WRITE_STALL,
  SST_READ_MICROS,
  // The number of subcompactions actually scheduled during a compaction
  NUM_SUBCOMPACTIONS_SCHEDULED,
  // Value size distribution in each operation
  BYTES_PER_READ,
  BYTES_PER_WRITE,
  BYTES_PER_MULTIGET,

  // number of bytes compressed/decompressed
  // number of bytes is when uncompressed; i.e. before/after respectively
  BYTES_COMPRESSED,
  BYTES_DECOMPRESSED,
  COMPRESSION_TIMES_NANOS,
  DECOMPRESSION_TIMES_NANOS,
  // Number of merge operands passed to the merge operator in user read
  // requests.
  READ_NUM_MERGE_OPERANDS,

  // BlobDB specific stats
  // Size of keys written to BlobDB.
  BLOB_DB_KEY_SIZE,
  // Size of values written to BlobDB.
  BLOB_DB_VALUE_SIZE,
  // BlobDB Put/PutWithTTL/PutUntil/Write latency.
  BLOB_DB_WRITE_MICROS,
  // BlobDB Get latency.
  BLOB_DB_GET_MICROS,
  // BlobDB MultiGet latency.
  BLOB_DB_MULTIGET_MICROS,
  // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
  BLOB_DB_SEEK_MICROS,
  // BlobDB Next latency.
  BLOB_DB_NEXT_MICROS,
  // BlobDB Prev latency.
  BLOB_DB_PREV_MICROS,
  // Blob file write latency.
  BLOB_DB_BLOB_FILE_WRITE_MICROS,
  // Blob file read latency.
  BLOB_DB_BLOB_FILE_READ_MICROS,
  // Blob file sync latency.
  BLOB_DB_BLOB_FILE_SYNC_MICROS,
  // BlobDB garbage collection time.
  BLOB_DB_GC_MICROS,
  // BlobDB compression time.
  BLOB_DB_COMPRESSION_MICROS,
  // BlobDB decompression time.
  BLOB_DB_DECOMPRESSION_MICROS,
  // Time spent flushing memtable to disk
  FLUSH_TIME,

  HISTOGRAM_ENUM_MAX,
};
430
431 extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
432
// Summary of one histogram's distribution, filled in by
// Statistics::histogramData().
//
// All members are value-initialized so a default-constructed HistogramData —
// or one only partially filled by an older Statistics implementation — reads
// as all-zero instead of containing indeterminate values.
struct HistogramData {
  double median = 0.0;
  double percentile95 = 0.0;
  double percentile99 = 0.0;
  double average = 0.0;
  double standard_deviation = 0.0;
  // zero-initialize new members since old Statistics::histogramData()
  // implementations won't write them.
  double max = 0.0;
  uint64_t count = 0;
  uint64_t sum = 0;
  double min = 0.0;
};
446
// How much detail a Statistics object collects. The enumerators are ordered
// from least to most detailed; callers compare levels numerically (e.g.
// `get_stats_level() <= kExceptTimers`), so the ordering is part of the
// contract.
enum StatsLevel : uint8_t {
  // Disable timer stats, and skip histogram stats
  kExceptHistogramOrTimers,
  // Skip timer stats
  kExceptTimers,
  // Collect all stats except time inside mutex lock AND time spent on
  // compression.
  kExceptDetailedTimers,
  // Collect all stats except the counters requiring to get time inside the
  // mutex lock.
  kExceptTimeForMutex,
  // Collect all stats, including measuring duration of mutex operations.
  // If getting time is expensive on the platform to run, it can
  // reduce scalability to more threads, especially for writes.
  kAll,
};
463
464 // Analyze the performance of a db
465 class Statistics {
466 public:
467 virtual ~Statistics() {}
468
469 virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
470 virtual void histogramData(uint32_t type,
471 HistogramData* const data) const = 0;
472 virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
473 virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
474 virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
475 virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
476 virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
477 if (get_stats_level() <= StatsLevel::kExceptTimers) {
478 return;
479 }
480 recordInHistogram(histogramType, time);
481 }
482 // The function is here only for backward compatibility reason.
483 // Users implementing their own Statistics class should override
484 // recordInHistogram() instead and leave measureTime() as it is.
485 virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
486 // This is not supposed to be called.
487 assert(false);
488 }
489 virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
490 // measureTime() is the old and inaccurate function name.
491 // To keep backward compatible. If users implement their own
492 // statistics, which overrides meareTime() but doesn't override
493 // this function. We forward to measureTime().
494 measureTime(histogramType, time);
495 }
496
497 // Resets all ticker and histogram stats
498 virtual Status Reset() { return Status::NotSupported("Not implemented"); }
499
500 // String representation of the statistic object.
501 virtual std::string ToString() const {
502 // Do nothing by default
503 return std::string("ToString(): not implemented");
504 }
505
506 virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
507 // Do nothing by default
508 return false;
509 };
510
511 // Override this function to disable particular histogram collection
512 virtual bool HistEnabledForType(uint32_t type) const {
513 return type < HISTOGRAM_ENUM_MAX;
514 }
515 void set_stats_level(StatsLevel sl) {
516 stats_level_.store(sl, std::memory_order_relaxed);
517 }
518 StatsLevel get_stats_level() const {
519 return stats_level_.load(std::memory_order_relaxed);
520 }
521
522 private:
523 std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
524 };
525
526 // Create a concrete DBStatistics object
527 std::shared_ptr<Statistics> CreateDBStatistics();
528
529 } // namespace rocksdb