]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_perf_counters.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / osd_perf_counters.cc
CommitLineData
9f95a23c
TL
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
3
4#include "osd_perf_counters.h"
5#include "include/common_fwd.h"
6
7
8PerfCounters *build_osd_logger(CephContext *cct) {
9 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
10
11 // Latency axis configuration for op histograms, values are in nanoseconds
12 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
13 "Latency (usec)",
14 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
15 0, ///< Start at 0
16 100000, ///< Quantization unit is 100usec
17 32, ///< Enough to cover much longer than slow requests
18 };
19
20 // Op size axis configuration for op histograms, values are in bytes
21 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
22 "Request size (bytes)",
23 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
24 0, ///< Start at 0
25 512, ///< Quantization unit is 512 bytes
26 32, ///< Enough to cover requests larger than GB
27 };
28
29
30 // All the basic OSD operation stats are to be considered useful
31 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
32
33 osd_plb.add_u64(
34 l_osd_op_wip, "op_wip",
35 "Replication operations currently being processed (primary)");
36 osd_plb.add_u64_counter(
37 l_osd_op, "op",
38 "Client operations",
39 "ops", PerfCountersBuilder::PRIO_CRITICAL);
40 osd_plb.add_u64_counter(
41 l_osd_op_inb, "op_in_bytes",
42 "Client operations total write size",
43 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
44 osd_plb.add_u64_counter(
45 l_osd_op_outb, "op_out_bytes",
46 "Client operations total read size",
47 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
48 osd_plb.add_time_avg(
49 l_osd_op_lat, "op_latency",
50 "Latency of client operations (including queue time)",
51 "l", 9);
52 osd_plb.add_time_avg(
53 l_osd_op_process_lat, "op_process_latency",
54 "Latency of client operations (excluding queue time)");
55 osd_plb.add_time_avg(
56 l_osd_op_prepare_lat, "op_prepare_latency",
57 "Latency of client operations (excluding queue time and wait for finished)");
58
1e59de90
TL
59 osd_plb.add_u64_counter(
60 l_osd_op_delayed_unreadable, "op_delayed_unreadable",
61 "Count of ops delayed due to target object being unreadable");
62 osd_plb.add_u64_counter(
63 l_osd_op_delayed_degraded, "op_delayed_degraded",
64 "Count of ops delayed due to target object being degraded");
65
9f95a23c
TL
66 osd_plb.add_u64_counter(
67 l_osd_op_r, "op_r", "Client read operations");
68 osd_plb.add_u64_counter(
69 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
70 osd_plb.add_time_avg(
71 l_osd_op_r_lat, "op_r_latency",
72 "Latency of read operation (including queue time)");
73 osd_plb.add_u64_counter_histogram(
74 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
75 op_hist_x_axis_config, op_hist_y_axis_config,
76 "Histogram of operation latency (including queue time) + data read");
77 osd_plb.add_time_avg(
78 l_osd_op_r_process_lat, "op_r_process_latency",
79 "Latency of read operation (excluding queue time)");
80 osd_plb.add_time_avg(
81 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
82 "Latency of read operations (excluding queue time and wait for finished)");
83 osd_plb.add_u64_counter(
84 l_osd_op_w, "op_w", "Client write operations");
85 osd_plb.add_u64_counter(
86 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
87 osd_plb.add_time_avg(
88 l_osd_op_w_lat, "op_w_latency",
89 "Latency of write operation (including queue time)");
90 osd_plb.add_u64_counter_histogram(
91 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
92 op_hist_x_axis_config, op_hist_y_axis_config,
93 "Histogram of operation latency (including queue time) + data written");
94 osd_plb.add_time_avg(
95 l_osd_op_w_process_lat, "op_w_process_latency",
96 "Latency of write operation (excluding queue time)");
97 osd_plb.add_time_avg(
98 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
99 "Latency of write operations (excluding queue time and wait for finished)");
100 osd_plb.add_u64_counter(
101 l_osd_op_rw, "op_rw",
102 "Client read-modify-write operations");
103 osd_plb.add_u64_counter(
104 l_osd_op_rw_inb, "op_rw_in_bytes",
105 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
106 osd_plb.add_u64_counter(
107 l_osd_op_rw_outb,"op_rw_out_bytes",
108 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
109 osd_plb.add_time_avg(
110 l_osd_op_rw_lat, "op_rw_latency",
111 "Latency of read-modify-write operation (including queue time)");
112 osd_plb.add_u64_counter_histogram(
113 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
114 op_hist_x_axis_config, op_hist_y_axis_config,
115 "Histogram of rw operation latency (including queue time) + data written");
116 osd_plb.add_u64_counter_histogram(
117 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
118 op_hist_x_axis_config, op_hist_y_axis_config,
119 "Histogram of rw operation latency (including queue time) + data read");
120 osd_plb.add_time_avg(
121 l_osd_op_rw_process_lat, "op_rw_process_latency",
122 "Latency of read-modify-write operation (excluding queue time)");
123 osd_plb.add_time_avg(
124 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
125 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
126
127 // Now we move on to some more obscure stats, revert to assuming things
128 // are low priority unless otherwise specified.
129 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
130
131 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
132 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
133 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
134 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
135
136 osd_plb.add_u64_counter(
137 l_osd_sop, "subop", "Suboperations");
138 osd_plb.add_u64_counter(
139 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
140 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
141
142 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
143 osd_plb.add_u64_counter(
144 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
145 osd_plb.add_time_avg(
146 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
147 osd_plb.add_u64_counter(
148 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
149 osd_plb.add_time_avg(
150 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
151 osd_plb.add_u64_counter(
152 l_osd_sop_push, "subop_push", "Suboperations push messages");
153 osd_plb.add_u64_counter(
154 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
155 osd_plb.add_time_avg(
156 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
157
158 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
159 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
160 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
161
162 osd_plb.add_u64_counter(
163 l_osd_rop, "recovery_ops",
164 "Started recovery operations",
165 "rop", PerfCountersBuilder::PRIO_INTERESTING);
166
167 osd_plb.add_u64_counter(
168 l_osd_rbytes, "recovery_bytes",
169 "recovery bytes",
170 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
171
1e59de90
TL
172 osd_plb.add_time_avg(
173 l_osd_recovery_push_queue_lat,
174 "l_osd_recovery_push_queue_latency",
175 "MOSDPGPush queue latency");
176 osd_plb.add_time_avg(
177 l_osd_recovery_push_reply_queue_lat,
178 "l_osd_recovery_push_reply_queue_latency",
179 "MOSDPGPushReply queue latency");
180 osd_plb.add_time_avg(
181 l_osd_recovery_pull_queue_lat,
182 "l_osd_recovery_pull_queue_latency",
183 "MOSDPGPull queue latency");
184 osd_plb.add_time_avg(
185 l_osd_recovery_backfill_queue_lat,
186 "l_osd_recovery_backfill_queue_latency",
187 "MOSDPGBackfill queue latency");
188 osd_plb.add_time_avg(
189 l_osd_recovery_backfill_remove_queue_lat,
190 "l_osd_recovery_backfill_remove_queue_latency",
191 "MOSDPGBackfillDelete queue latency");
192 osd_plb.add_time_avg(
193 l_osd_recovery_scan_queue_lat,
194 "l_osd_recovery_scan_queue_latency",
195 "MOSDPGScan queue latency");
196
197 osd_plb.add_time_avg(
198 l_osd_recovery_queue_lat,
199 "l_osd_recovery_queue_latency",
200 "PGRecovery queue latency");
201 osd_plb.add_time_avg(
202 l_osd_recovery_context_queue_lat,
203 "l_osd_recovery_context_queue_latency",
204 "PGRecoveryContext queue latency");
205
9f95a23c
TL
206 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
207 osd_plb.add_u64(
208 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
209 osd_plb.add_u64(
210 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
211 "Total number getting crc from crc_cache with adjusting");
212 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
213 "Total number of crc cache misses");
214
215 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
216 "pgs", PerfCountersBuilder::PRIO_USEFUL);
217 osd_plb.add_u64(
218 l_osd_pg_primary, "numpg_primary",
219 "Placement groups for which this osd is primary");
220 osd_plb.add_u64(
221 l_osd_pg_replica, "numpg_replica",
222 "Placement groups for which this osd is replica");
223 osd_plb.add_u64(
224 l_osd_pg_stray, "numpg_stray",
225 "Placement groups ready to be deleted from this osd");
226 osd_plb.add_u64(
227 l_osd_pg_removing, "numpg_removing",
228 "Placement groups queued for local deletion", "pgsr",
229 PerfCountersBuilder::PRIO_USEFUL);
230 osd_plb.add_u64(
231 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
232 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
233 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
234 osd_plb.add_u64_counter(
235 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
236 osd_plb.add_u64_counter(
237 l_osd_waiting_for_map, "messages_delayed_for_map",
238 "Operations waiting for OSD map");
239
240 osd_plb.add_u64_counter(
241 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
242 osd_plb.add_u64_counter(
243 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
244 osd_plb.add_u64_counter(
245 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
246 "osdmap cache miss below cache lower bound");
247 osd_plb.add_u64_avg(
248 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
249 "osdmap cache miss, avg distance below cache lower bound");
250 osd_plb.add_u64_counter(
251 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
252 "OSDMap buffer cache hits");
253 osd_plb.add_u64_counter(
254 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
255 "OSDMap buffer cache misses");
256
257 osd_plb.add_u64(
258 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
259 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
260 osd_plb.add_u64(
261 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
262 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
263 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
264
265 osd_plb.add_u64_counter(
266 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
267
268 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
269 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
270 osd_plb.add_u64_counter(
271 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
272 osd_plb.add_u64_counter(
273 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
274 osd_plb.add_u64_counter(
275 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
276 "Failed tier flush attempts");
277 osd_plb.add_u64_counter(
278 l_osd_tier_evict, "tier_evict", "Tier evictions");
279 osd_plb.add_u64_counter(
280 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
281 osd_plb.add_u64_counter(
282 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
283 osd_plb.add_u64_counter(
284 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
285 osd_plb.add_u64_counter(
286 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
287 osd_plb.add_u64_counter(
288 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
289 osd_plb.add_u64_counter(
290 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
291
292 osd_plb.add_u64_counter(
293 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
294 osd_plb.add_u64_counter(
295 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
296 osd_plb.add_u64_counter(
297 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
298 osd_plb.add_u64_counter(
299 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
300
301 osd_plb.add_u64_counter(
302 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
303 osd_plb.add_u64_counter(
304 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
305
306 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
307 osd_plb.add_time_avg(
308 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
309 osd_plb.add_time_avg(
310 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
311 osd_plb.add_time_avg(
312 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
313
314 osd_plb.add_u64_counter(
315 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
316 osd_plb.add_u64_counter(
317 l_osd_pg_fastinfo, "osd_pg_fastinfo",
318 "PG updated its info using fastinfo attr");
319 osd_plb.add_u64_counter(
320 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
321
322 return osd_plb.create_perf_counters();
323}
324
325
326PerfCounters *build_recoverystate_perf(CephContext *cct) {
327 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
328
329 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
330 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
331 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
332 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
333 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
334 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
335 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
336 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
337 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
338 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
339 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
340 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
341 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
342 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
343 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
344 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
345 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
346 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
347 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
348 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
349 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
350 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
351 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
352 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
353 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
354 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
355 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
356 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
357 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
358 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
359 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
360
361 return rs_perf.create_perf_counters();
362}