1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "osd_perf_counters.h"
5 #include "include/common_fwd.h"
// Builds the PerfCounters set registered under the name "osd".
//
// A PerfCountersBuilder is constructed from `cct`, the name "osd" and the
// l_osd_first / l_osd_last identifiers; every counter below is registered on
// that builder and the function returns the result of
// create_perf_counters().
//
// NOTE(review): in this excerpt several statements are incomplete -- the two
// axis_config_d initializers are never closed, a number of argument lists
// have no visible `osd_plb.add_*(` opener, and the function's closing brace
// is not shown. Those spots are flagged inline; confirm them against the
// complete file before changing any code here.
8 PerfCounters
*build_osd_logger(CephContext
*cct
) {
9 PerfCountersBuilder
osd_plb(cct
, "osd", l_osd_first
, l_osd_last
);
11 // Latency axis configuration for op histograms, values are in nanoseconds
// NOTE(review): no axis label string is visible before the scale type here
// (the y axis below has one) -- confirm against the full file.
12 PerfHistogramCommon::axis_config_d op_hist_x_axis_config
{
14 PerfHistogramCommon::SCALE_LOG2
, ///< Latency in logarithmic scale
16 100000, ///< Quantization unit is 100usec
17 32, ///< Enough to cover much longer than slow requests
// NOTE(review): the closing of the x-axis initializer is not visible in this
// excerpt.
20 // Op size axis configuration for op histograms, values are in bytes
21 PerfHistogramCommon::axis_config_d op_hist_y_axis_config
{
22 "Request size (bytes)",
23 PerfHistogramCommon::SCALE_LOG2
, ///< Request size in logarithmic scale
25 512, ///< Quantization unit is 512 bytes
26 32, ///< Enough to cover requests larger than GB
// NOTE(review): the closing of the y-axis initializer is not visible in this
// excerpt.
30 // All the basic OSD operation stats are to be considered useful
// (counters registered below without an explicit PRIO_* argument are
// presumably given this default -- confirm in PerfCountersBuilder).
31 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
// --- Client operation counters ---
// NOTE(review): the call opening this argument list is not visible in this excerpt.
34 l_osd_op_wip
, "op_wip",
35 "Replication operations currently being processed (primary)");
// NOTE(review): the counter index, name and description of the following
// call are not visible in this excerpt; only the nick "ops" and the priority
// remain.
36 osd_plb
.add_u64_counter(
39 "ops", PerfCountersBuilder::PRIO_CRITICAL
);
40 osd_plb
.add_u64_counter(
41 l_osd_op_inb
, "op_in_bytes",
42 "Client operations total write size",
43 "wr", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
44 osd_plb
.add_u64_counter(
45 l_osd_op_outb
, "op_out_bytes",
46 "Client operations total read size",
47 "rd", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
// NOTE(review): the call opening this argument list is not visible in this
// excerpt, and the list ends on a comma -- its trailing arguments and closing
// parenthesis are missing as well.
49 l_osd_op_lat
, "op_latency",
50 "Latency of client operations (including queue time)",
// NOTE(review): the call opening this argument list is not visible in this excerpt.
53 l_osd_op_process_lat
, "op_process_latency",
54 "Latency of client operations (excluding queue time)");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
56 l_osd_op_prepare_lat
, "op_prepare_latency",
57 "Latency of client operations (excluding queue time and wait for finished)");
59 osd_plb
.add_u64_counter(
60 l_osd_op_delayed_unreadable
, "op_delayed_unreadable",
61 "Count of ops delayed due to target object being unreadable");
62 osd_plb
.add_u64_counter(
63 l_osd_op_delayed_degraded
, "op_delayed_degraded",
64 "Count of ops delayed due to target object being degraded");
// --- Client read operations ---
66 osd_plb
.add_u64_counter(
67 l_osd_op_r
, "op_r", "Client read operations");
68 osd_plb
.add_u64_counter(
69 l_osd_op_r_outb
, "op_r_out_bytes", "Client data read", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
// NOTE(review): the call opening this argument list is not visible in this excerpt.
71 l_osd_op_r_lat
, "op_r_latency",
72 "Latency of read operation (including queue time)");
// Histogram registered with the x (latency) and y (request size) axis
// configurations declared at the top of the function.
73 osd_plb
.add_u64_counter_histogram(
74 l_osd_op_r_lat_outb_hist
, "op_r_latency_out_bytes_histogram",
75 op_hist_x_axis_config
, op_hist_y_axis_config
,
76 "Histogram of operation latency (including queue time) + data read");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
78 l_osd_op_r_process_lat
, "op_r_process_latency",
79 "Latency of read operation (excluding queue time)");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
81 l_osd_op_r_prepare_lat
, "op_r_prepare_latency",
82 "Latency of read operations (excluding queue time and wait for finished)");
// --- Client write operations ---
83 osd_plb
.add_u64_counter(
84 l_osd_op_w
, "op_w", "Client write operations");
85 osd_plb
.add_u64_counter(
86 l_osd_op_w_inb
, "op_w_in_bytes", "Client data written");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
88 l_osd_op_w_lat
, "op_w_latency",
89 "Latency of write operation (including queue time)");
90 osd_plb
.add_u64_counter_histogram(
91 l_osd_op_w_lat_inb_hist
, "op_w_latency_in_bytes_histogram",
92 op_hist_x_axis_config
, op_hist_y_axis_config
,
93 "Histogram of operation latency (including queue time) + data written");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
95 l_osd_op_w_process_lat
, "op_w_process_latency",
96 "Latency of write operation (excluding queue time)");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
98 l_osd_op_w_prepare_lat
, "op_w_prepare_latency",
99 "Latency of write operations (excluding queue time and wait for finished)");
// --- Client read-modify-write operations ---
100 osd_plb
.add_u64_counter(
101 l_osd_op_rw
, "op_rw",
102 "Client read-modify-write operations");
103 osd_plb
.add_u64_counter(
104 l_osd_op_rw_inb
, "op_rw_in_bytes",
105 "Client read-modify-write operations write in", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
106 osd_plb
.add_u64_counter(
107 l_osd_op_rw_outb
,"op_rw_out_bytes",
108 "Client read-modify-write operations read out ", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
109 osd_plb
.add_time_avg(
110 l_osd_op_rw_lat
, "op_rw_latency",
111 "Latency of read-modify-write operation (including queue time)");
112 osd_plb
.add_u64_counter_histogram(
113 l_osd_op_rw_lat_inb_hist
, "op_rw_latency_in_bytes_histogram",
114 op_hist_x_axis_config
, op_hist_y_axis_config
,
115 "Histogram of rw operation latency (including queue time) + data written");
116 osd_plb
.add_u64_counter_histogram(
117 l_osd_op_rw_lat_outb_hist
, "op_rw_latency_out_bytes_histogram",
118 op_hist_x_axis_config
, op_hist_y_axis_config
,
119 "Histogram of rw operation latency (including queue time) + data read");
120 osd_plb
.add_time_avg(
121 l_osd_op_rw_process_lat
, "op_rw_process_latency",
122 "Latency of read-modify-write operation (excluding queue time)");
123 osd_plb
.add_time_avg(
124 l_osd_op_rw_prepare_lat
, "op_rw_prepare_latency",
125 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
127 // Now we move on to some more obscure stats, revert to assuming things
128 // are low priority unless otherwise specified.
129 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
131 osd_plb
.add_time_avg(l_osd_op_before_queue_op_lat
, "op_before_queue_op_lat",
132 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
133 osd_plb
.add_time_avg(l_osd_op_before_dequeue_op_lat
, "op_before_dequeue_op_lat",
134 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
// --- Sub-operation counters ---
// NOTE(review): in the byte-valued calls of this group a literal 0 sits in
// the argument position where earlier calls pass a PRIO_* constant.
136 osd_plb
.add_u64_counter(
137 l_osd_sop
, "subop", "Suboperations");
138 osd_plb
.add_u64_counter(
139 l_osd_sop_inb
, "subop_in_bytes", "Suboperations total size", NULL
, 0, unit_t(UNIT_BYTES
));
140 osd_plb
.add_time_avg(l_osd_sop_lat
, "subop_latency", "Suboperations latency");
142 osd_plb
.add_u64_counter(l_osd_sop_w
, "subop_w", "Replicated writes");
143 osd_plb
.add_u64_counter(
144 l_osd_sop_w_inb
, "subop_w_in_bytes", "Replicated written data size", NULL
, 0, unit_t(UNIT_BYTES
));
145 osd_plb
.add_time_avg(
146 l_osd_sop_w_lat
, "subop_w_latency", "Replicated writes latency");
147 osd_plb
.add_u64_counter(
148 l_osd_sop_pull
, "subop_pull", "Suboperations pull requests");
149 osd_plb
.add_time_avg(
150 l_osd_sop_pull_lat
, "subop_pull_latency", "Suboperations pull latency");
151 osd_plb
.add_u64_counter(
152 l_osd_sop_push
, "subop_push", "Suboperations push messages");
153 osd_plb
.add_u64_counter(
154 l_osd_sop_push_inb
, "subop_push_in_bytes", "Suboperations pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
155 osd_plb
.add_time_avg(
156 l_osd_sop_push_lat
, "subop_push_latency", "Suboperations push latency");
// --- Pull / push message counters ---
158 osd_plb
.add_u64_counter(l_osd_pull
, "pull", "Pull requests sent");
159 osd_plb
.add_u64_counter(l_osd_push
, "push", "Push messages sent");
160 osd_plb
.add_u64_counter(l_osd_push_outb
, "push_out_bytes", "Pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
// --- Recovery counters ---
162 osd_plb
.add_u64_counter(
163 l_osd_rop
, "recovery_ops",
164 "Started recovery operations",
165 "rop", PerfCountersBuilder::PRIO_INTERESTING
);
// NOTE(review): no description string is visible between the counter name
// and the nick "rbt" in the following call -- confirm against the full file.
167 osd_plb
.add_u64_counter(
168 l_osd_rbytes
, "recovery_bytes",
170 "rbt", PerfCountersBuilder::PRIO_INTERESTING
);
// --- Recovery queue latencies ---
// NOTE(review): unlike the other counters in this function, the names
// registered in this group carry an "l_osd_" prefix. The strings are
// runtime-visible, so confirm with consumers before normalising them.
172 osd_plb
.add_time_avg(
173 l_osd_recovery_push_queue_lat
,
174 "l_osd_recovery_push_queue_latency",
175 "MOSDPGPush queue latency");
176 osd_plb
.add_time_avg(
177 l_osd_recovery_push_reply_queue_lat
,
178 "l_osd_recovery_push_reply_queue_latency",
179 "MOSDPGPushReply queue latency");
180 osd_plb
.add_time_avg(
181 l_osd_recovery_pull_queue_lat
,
182 "l_osd_recovery_pull_queue_latency",
183 "MOSDPGPull queue latency");
184 osd_plb
.add_time_avg(
185 l_osd_recovery_backfill_queue_lat
,
186 "l_osd_recovery_backfill_queue_latency",
187 "MOSDPGBackfill queue latency");
188 osd_plb
.add_time_avg(
189 l_osd_recovery_backfill_remove_queue_lat
,
190 "l_osd_recovery_backfill_remove_queue_latency",
191 "MOSDPGBackfillDelete queue latency");
192 osd_plb
.add_time_avg(
193 l_osd_recovery_scan_queue_lat
,
194 "l_osd_recovery_scan_queue_latency",
195 "MOSDPGScan queue latency");
197 osd_plb
.add_time_avg(
198 l_osd_recovery_queue_lat
,
199 "l_osd_recovery_queue_latency",
200 "PGRecovery queue latency");
201 osd_plb
.add_time_avg(
202 l_osd_recovery_context_queue_lat
,
203 "l_osd_recovery_context_queue_latency",
204 "PGRecoveryContext queue latency");
// --- Host load and CRC cache counters ---
206 osd_plb
.add_u64(l_osd_loadavg
, "loadavg", "CPU load");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
208 l_osd_cached_crc
, "cached_crc", "Total number getting crc from crc_cache");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
210 l_osd_cached_crc_adjusted
, "cached_crc_adjusted",
211 "Total number getting crc from crc_cache with adjusting");
212 osd_plb
.add_u64(l_osd_missed_crc
, "missed_crc",
213 "Total number of crc cache misses");
// --- Placement group counts ---
215 osd_plb
.add_u64(l_osd_pg
, "numpg", "Placement groups",
216 "pgs", PerfCountersBuilder::PRIO_USEFUL
);
// NOTE(review): the call opening this argument list is not visible in this excerpt.
218 l_osd_pg_primary
, "numpg_primary",
219 "Placement groups for which this osd is primary");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
221 l_osd_pg_replica
, "numpg_replica",
222 "Placement groups for which this osd is replica");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
224 l_osd_pg_stray
, "numpg_stray",
225 "Placement groups ready to be deleted from this osd");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
227 l_osd_pg_removing
, "numpg_removing",
228 "Placement groups queued for local deletion", "pgsr",
229 PerfCountersBuilder::PRIO_USEFUL
);
// --- Heartbeat and OSD map counters ---
// NOTE(review): the call opening this argument list is not visible in this excerpt.
231 l_osd_hb_to
, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
232 osd_plb
.add_u64_counter(l_osd_map
, "map_messages", "OSD map messages");
233 osd_plb
.add_u64_counter(l_osd_mape
, "map_message_epochs", "OSD map epochs");
234 osd_plb
.add_u64_counter(
235 l_osd_mape_dup
, "map_message_epoch_dups", "OSD map duplicates");
236 osd_plb
.add_u64_counter(
237 l_osd_waiting_for_map
, "messages_delayed_for_map",
238 "Operations waiting for OSD map");
240 osd_plb
.add_u64_counter(
241 l_osd_map_cache_hit
, "osd_map_cache_hit", "osdmap cache hit");
242 osd_plb
.add_u64_counter(
243 l_osd_map_cache_miss
, "osd_map_cache_miss", "osdmap cache miss");
244 osd_plb
.add_u64_counter(
245 l_osd_map_cache_miss_low
, "osd_map_cache_miss_low",
246 "osdmap cache miss below cache lower bound");
// NOTE(review): the call opening this argument list is not visible in this excerpt.
248 l_osd_map_cache_miss_low_avg
, "osd_map_cache_miss_low_avg",
249 "osdmap cache miss, avg distance below cache lower bound");
250 osd_plb
.add_u64_counter(
251 l_osd_map_bl_cache_hit
, "osd_map_bl_cache_hit",
252 "OSDMap buffer cache hits");
253 osd_plb
.add_u64_counter(
254 l_osd_map_bl_cache_miss
, "osd_map_bl_cache_miss",
255 "OSDMap buffer cache misses");
// --- Capacity statistics (byte-valued) ---
// NOTE(review): the call opening this argument list is not visible in this excerpt.
258 l_osd_stat_bytes
, "stat_bytes", "OSD size", "size",
259 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
// NOTE(review): the call opening this argument list is not visible in this excerpt.
261 l_osd_stat_bytes_used
, "stat_bytes_used", "Used space", "used",
262 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
263 osd_plb
.add_u64(l_osd_stat_bytes_avail
, "stat_bytes_avail", "Available space", NULL
, 0, unit_t(UNIT_BYTES
));
265 osd_plb
.add_u64_counter(
266 l_osd_copyfrom
, "copyfrom", "Rados \"copy-from\" operations");
// --- Cache tiering counters ---
268 osd_plb
.add_u64_counter(l_osd_tier_promote
, "tier_promote", "Tier promotions");
269 osd_plb
.add_u64_counter(l_osd_tier_flush
, "tier_flush", "Tier flushes");
270 osd_plb
.add_u64_counter(
271 l_osd_tier_flush_fail
, "tier_flush_fail", "Failed tier flushes");
272 osd_plb
.add_u64_counter(
273 l_osd_tier_try_flush
, "tier_try_flush", "Tier flush attempts");
274 osd_plb
.add_u64_counter(
275 l_osd_tier_try_flush_fail
, "tier_try_flush_fail",
276 "Failed tier flush attempts");
277 osd_plb
.add_u64_counter(
278 l_osd_tier_evict
, "tier_evict", "Tier evictions");
279 osd_plb
.add_u64_counter(
280 l_osd_tier_whiteout
, "tier_whiteout", "Tier whiteouts");
281 osd_plb
.add_u64_counter(
282 l_osd_tier_dirty
, "tier_dirty", "Dirty tier flag set");
283 osd_plb
.add_u64_counter(
284 l_osd_tier_clean
, "tier_clean", "Dirty tier flag cleaned");
285 osd_plb
.add_u64_counter(
286 l_osd_tier_delay
, "tier_delay", "Tier delays (agent waiting)");
287 osd_plb
.add_u64_counter(
288 l_osd_tier_proxy_read
, "tier_proxy_read", "Tier proxy reads");
289 osd_plb
.add_u64_counter(
290 l_osd_tier_proxy_write
, "tier_proxy_write", "Tier proxy writes");
// --- Tiering agent counters ---
292 osd_plb
.add_u64_counter(
293 l_osd_agent_wake
, "agent_wake", "Tiering agent wake up");
294 osd_plb
.add_u64_counter(
295 l_osd_agent_skip
, "agent_skip", "Objects skipped by agent");
296 osd_plb
.add_u64_counter(
297 l_osd_agent_flush
, "agent_flush", "Tiering agent flushes");
298 osd_plb
.add_u64_counter(
299 l_osd_agent_evict
, "agent_evict", "Tiering agent evictions");
// --- Object context cache counters ---
301 osd_plb
.add_u64_counter(
302 l_osd_object_ctx_cache_hit
, "object_ctx_cache_hit", "Object context cache hits");
303 osd_plb
.add_u64_counter(
304 l_osd_object_ctx_cache_total
, "object_ctx_cache_total", "Object context cache lookups");
// NOTE(review): this counter is registered with a name only; no description
// string is passed, unlike every other call visible in this function.
306 osd_plb
.add_u64_counter(l_osd_op_cache_hit
, "op_cache_hit");
307 osd_plb
.add_time_avg(
308 l_osd_tier_flush_lat
, "osd_tier_flush_lat", "Object flush latency");
309 osd_plb
.add_time_avg(
310 l_osd_tier_promote_lat
, "osd_tier_promote_lat", "Object promote latency");
311 osd_plb
.add_time_avg(
312 l_osd_tier_r_lat
, "osd_tier_r_lat", "Object proxy read latency");
// --- PG info update counters ---
314 osd_plb
.add_u64_counter(
315 l_osd_pg_info
, "osd_pg_info", "PG updated its info (using any method)");
316 osd_plb
.add_u64_counter(
317 l_osd_pg_fastinfo
, "osd_pg_fastinfo",
318 "PG updated its info using fastinfo attr");
319 osd_plb
.add_u64_counter(
320 l_osd_pg_biginfo
, "osd_pg_biginfo", "PG updated its biginfo attr");
// Hand the populated builder's result back to the caller.
// NOTE(review): the function's closing brace is not visible after this
// statement in this excerpt.
322 return osd_plb
.create_perf_counters();
// Builds the PerfCounters set registered under the name
// "recoverystate_perf".
//
// A PerfCountersBuilder is constructed from `cct`, that name, and the
// rs_first / rs_last identifiers (presumably the bounds of the rs_*
// identifier range -- confirm in the header). Each rs_*_latency identifier
// listed below is registered through add_time_avg() with a matching name and
// description, and the function returns the result of
// create_perf_counters().
//
// NOTE(review): the function's closing brace lies outside this excerpt.
326 PerfCounters
*build_recoverystate_perf(CephContext
*cct
) {
327 PerfCountersBuilder
rs_perf(cct
, "recoverystate_perf", rs_first
, rs_last
);
// One add_time_avg() registration per recovery state follows.
329 rs_perf
.add_time_avg(rs_initial_latency
, "initial_latency", "Initial recovery state latency");
330 rs_perf
.add_time_avg(rs_started_latency
, "started_latency", "Started recovery state latency");
331 rs_perf
.add_time_avg(rs_reset_latency
, "reset_latency", "Reset recovery state latency");
332 rs_perf
.add_time_avg(rs_start_latency
, "start_latency", "Start recovery state latency");
333 rs_perf
.add_time_avg(rs_primary_latency
, "primary_latency", "Primary recovery state latency");
334 rs_perf
.add_time_avg(rs_peering_latency
, "peering_latency", "Peering recovery state latency");
335 rs_perf
.add_time_avg(rs_backfilling_latency
, "backfilling_latency", "Backfilling recovery state latency");
336 rs_perf
.add_time_avg(rs_waitremotebackfillreserved_latency
, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
337 rs_perf
.add_time_avg(rs_waitlocalbackfillreserved_latency
, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
338 rs_perf
.add_time_avg(rs_notbackfilling_latency
, "notbackfilling_latency", "Notbackfilling recovery state latency");
339 rs_perf
.add_time_avg(rs_repnotrecovering_latency
, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
340 rs_perf
.add_time_avg(rs_repwaitrecoveryreserved_latency
, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
341 rs_perf
.add_time_avg(rs_repwaitbackfillreserved_latency
, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
342 rs_perf
.add_time_avg(rs_reprecovering_latency
, "reprecovering_latency", "RepRecovering recovery state latency");
343 rs_perf
.add_time_avg(rs_activating_latency
, "activating_latency", "Activating recovery state latency");
344 rs_perf
.add_time_avg(rs_waitlocalrecoveryreserved_latency
, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
345 rs_perf
.add_time_avg(rs_waitremoterecoveryreserved_latency
, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
346 rs_perf
.add_time_avg(rs_recovering_latency
, "recovering_latency", "Recovering recovery state latency");
347 rs_perf
.add_time_avg(rs_recovered_latency
, "recovered_latency", "Recovered recovery state latency");
348 rs_perf
.add_time_avg(rs_clean_latency
, "clean_latency", "Clean recovery state latency");
349 rs_perf
.add_time_avg(rs_active_latency
, "active_latency", "Active recovery state latency");
350 rs_perf
.add_time_avg(rs_replicaactive_latency
, "replicaactive_latency", "Replicaactive recovery state latency");
351 rs_perf
.add_time_avg(rs_stray_latency
, "stray_latency", "Stray recovery state latency");
352 rs_perf
.add_time_avg(rs_getinfo_latency
, "getinfo_latency", "Getinfo recovery state latency");
353 rs_perf
.add_time_avg(rs_getlog_latency
, "getlog_latency", "Getlog recovery state latency");
354 rs_perf
.add_time_avg(rs_waitactingchange_latency
, "waitactingchange_latency", "Waitactingchange recovery state latency");
355 rs_perf
.add_time_avg(rs_incomplete_latency
, "incomplete_latency", "Incomplete recovery state latency");
356 rs_perf
.add_time_avg(rs_down_latency
, "down_latency", "Down recovery state latency");
357 rs_perf
.add_time_avg(rs_getmissing_latency
, "getmissing_latency", "Getmissing recovery state latency");
358 rs_perf
.add_time_avg(rs_waitupthru_latency
, "waitupthru_latency", "Waitupthru recovery state latency");
359 rs_perf
.add_time_avg(rs_notrecovering_latency
, "notrecovering_latency", "Notrecovering recovery state latency");
// Hand the populated builder's result back to the caller.
361 return rs_perf
.create_perf_counters();