]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_perf_counters.cc
buildsys: switch source download to quincy
[ceph.git] / ceph / src / osd / osd_perf_counters.cc
CommitLineData
9f95a23c
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "osd_perf_counters.h"
5#include "include/common_fwd.h"
6
7
8PerfCounters *build_osd_logger(CephContext *cct) {
9 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
10
11 // Latency axis configuration for op histograms, values are in nanoseconds
12 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
13 "Latency (usec)",
14 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
15 0, ///< Start at 0
16 100000, ///< Quantization unit is 100usec
17 32, ///< Enough to cover much longer than slow requests
18 };
19
20 // Op size axis configuration for op histograms, values are in bytes
21 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
22 "Request size (bytes)",
23 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
24 0, ///< Start at 0
25 512, ///< Quantization unit is 512 bytes
26 32, ///< Enough to cover requests larger than GB
27 };
28
29
30 // All the basic OSD operation stats are to be considered useful
31 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
32
33 osd_plb.add_u64(
34 l_osd_op_wip, "op_wip",
35 "Replication operations currently being processed (primary)");
36 osd_plb.add_u64_counter(
37 l_osd_op, "op",
38 "Client operations",
39 "ops", PerfCountersBuilder::PRIO_CRITICAL);
40 osd_plb.add_u64_counter(
41 l_osd_op_inb, "op_in_bytes",
42 "Client operations total write size",
43 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
44 osd_plb.add_u64_counter(
45 l_osd_op_outb, "op_out_bytes",
46 "Client operations total read size",
47 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
48 osd_plb.add_time_avg(
49 l_osd_op_lat, "op_latency",
50 "Latency of client operations (including queue time)",
51 "l", 9);
52 osd_plb.add_time_avg(
53 l_osd_op_process_lat, "op_process_latency",
54 "Latency of client operations (excluding queue time)");
55 osd_plb.add_time_avg(
56 l_osd_op_prepare_lat, "op_prepare_latency",
57 "Latency of client operations (excluding queue time and wait for finished)");
58
59 osd_plb.add_u64_counter(
60 l_osd_op_r, "op_r", "Client read operations");
61 osd_plb.add_u64_counter(
62 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
63 osd_plb.add_time_avg(
64 l_osd_op_r_lat, "op_r_latency",
65 "Latency of read operation (including queue time)");
66 osd_plb.add_u64_counter_histogram(
67 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
68 op_hist_x_axis_config, op_hist_y_axis_config,
69 "Histogram of operation latency (including queue time) + data read");
70 osd_plb.add_time_avg(
71 l_osd_op_r_process_lat, "op_r_process_latency",
72 "Latency of read operation (excluding queue time)");
73 osd_plb.add_time_avg(
74 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
75 "Latency of read operations (excluding queue time and wait for finished)");
76 osd_plb.add_u64_counter(
77 l_osd_op_w, "op_w", "Client write operations");
78 osd_plb.add_u64_counter(
79 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
80 osd_plb.add_time_avg(
81 l_osd_op_w_lat, "op_w_latency",
82 "Latency of write operation (including queue time)");
83 osd_plb.add_u64_counter_histogram(
84 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
85 op_hist_x_axis_config, op_hist_y_axis_config,
86 "Histogram of operation latency (including queue time) + data written");
87 osd_plb.add_time_avg(
88 l_osd_op_w_process_lat, "op_w_process_latency",
89 "Latency of write operation (excluding queue time)");
90 osd_plb.add_time_avg(
91 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
92 "Latency of write operations (excluding queue time and wait for finished)");
93 osd_plb.add_u64_counter(
94 l_osd_op_rw, "op_rw",
95 "Client read-modify-write operations");
96 osd_plb.add_u64_counter(
97 l_osd_op_rw_inb, "op_rw_in_bytes",
98 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
99 osd_plb.add_u64_counter(
100 l_osd_op_rw_outb,"op_rw_out_bytes",
101 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
102 osd_plb.add_time_avg(
103 l_osd_op_rw_lat, "op_rw_latency",
104 "Latency of read-modify-write operation (including queue time)");
105 osd_plb.add_u64_counter_histogram(
106 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
107 op_hist_x_axis_config, op_hist_y_axis_config,
108 "Histogram of rw operation latency (including queue time) + data written");
109 osd_plb.add_u64_counter_histogram(
110 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
111 op_hist_x_axis_config, op_hist_y_axis_config,
112 "Histogram of rw operation latency (including queue time) + data read");
113 osd_plb.add_time_avg(
114 l_osd_op_rw_process_lat, "op_rw_process_latency",
115 "Latency of read-modify-write operation (excluding queue time)");
116 osd_plb.add_time_avg(
117 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
118 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
119
120 // Now we move on to some more obscure stats, revert to assuming things
121 // are low priority unless otherwise specified.
122 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
123
124 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
125 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
126 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
127 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
128
129 osd_plb.add_u64_counter(
130 l_osd_sop, "subop", "Suboperations");
131 osd_plb.add_u64_counter(
132 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
133 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
134
135 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
136 osd_plb.add_u64_counter(
137 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
138 osd_plb.add_time_avg(
139 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
140 osd_plb.add_u64_counter(
141 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
142 osd_plb.add_time_avg(
143 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
144 osd_plb.add_u64_counter(
145 l_osd_sop_push, "subop_push", "Suboperations push messages");
146 osd_plb.add_u64_counter(
147 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
148 osd_plb.add_time_avg(
149 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
150
151 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
152 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
153 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
154
155 osd_plb.add_u64_counter(
156 l_osd_rop, "recovery_ops",
157 "Started recovery operations",
158 "rop", PerfCountersBuilder::PRIO_INTERESTING);
159
160 osd_plb.add_u64_counter(
161 l_osd_rbytes, "recovery_bytes",
162 "recovery bytes",
163 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
164
165 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
166 osd_plb.add_u64(
167 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
168 osd_plb.add_u64(
169 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
170 "Total number getting crc from crc_cache with adjusting");
171 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
172 "Total number of crc cache misses");
173
174 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
175 "pgs", PerfCountersBuilder::PRIO_USEFUL);
176 osd_plb.add_u64(
177 l_osd_pg_primary, "numpg_primary",
178 "Placement groups for which this osd is primary");
179 osd_plb.add_u64(
180 l_osd_pg_replica, "numpg_replica",
181 "Placement groups for which this osd is replica");
182 osd_plb.add_u64(
183 l_osd_pg_stray, "numpg_stray",
184 "Placement groups ready to be deleted from this osd");
185 osd_plb.add_u64(
186 l_osd_pg_removing, "numpg_removing",
187 "Placement groups queued for local deletion", "pgsr",
188 PerfCountersBuilder::PRIO_USEFUL);
189 osd_plb.add_u64(
190 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
191 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
192 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
193 osd_plb.add_u64_counter(
194 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
195 osd_plb.add_u64_counter(
196 l_osd_waiting_for_map, "messages_delayed_for_map",
197 "Operations waiting for OSD map");
198
199 osd_plb.add_u64_counter(
200 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
201 osd_plb.add_u64_counter(
202 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
203 osd_plb.add_u64_counter(
204 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
205 "osdmap cache miss below cache lower bound");
206 osd_plb.add_u64_avg(
207 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
208 "osdmap cache miss, avg distance below cache lower bound");
209 osd_plb.add_u64_counter(
210 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
211 "OSDMap buffer cache hits");
212 osd_plb.add_u64_counter(
213 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
214 "OSDMap buffer cache misses");
215
216 osd_plb.add_u64(
217 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
218 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
219 osd_plb.add_u64(
220 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
221 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
222 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
223
224 osd_plb.add_u64_counter(
225 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
226
227 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
228 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
229 osd_plb.add_u64_counter(
230 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
231 osd_plb.add_u64_counter(
232 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
233 osd_plb.add_u64_counter(
234 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
235 "Failed tier flush attempts");
236 osd_plb.add_u64_counter(
237 l_osd_tier_evict, "tier_evict", "Tier evictions");
238 osd_plb.add_u64_counter(
239 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
240 osd_plb.add_u64_counter(
241 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
242 osd_plb.add_u64_counter(
243 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
244 osd_plb.add_u64_counter(
245 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
246 osd_plb.add_u64_counter(
247 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
248 osd_plb.add_u64_counter(
249 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
250
251 osd_plb.add_u64_counter(
252 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
253 osd_plb.add_u64_counter(
254 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
255 osd_plb.add_u64_counter(
256 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
257 osd_plb.add_u64_counter(
258 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
259
260 osd_plb.add_u64_counter(
261 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
262 osd_plb.add_u64_counter(
263 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
264
265 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
266 osd_plb.add_time_avg(
267 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
268 osd_plb.add_time_avg(
269 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
270 osd_plb.add_time_avg(
271 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
272
273 osd_plb.add_u64_counter(
274 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
275 osd_plb.add_u64_counter(
276 l_osd_pg_fastinfo, "osd_pg_fastinfo",
277 "PG updated its info using fastinfo attr");
278 osd_plb.add_u64_counter(
279 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
280
281 return osd_plb.create_perf_counters();
282}
283
284
285PerfCounters *build_recoverystate_perf(CephContext *cct) {
286 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
287
288 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
289 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
290 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
291 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
292 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
293 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
294 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
295 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
296 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
297 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
298 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
299 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
300 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
301 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
302 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
303 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
304 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
305 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
306 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
307 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
308 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
309 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
310 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
311 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
312 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
313 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
314 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
315 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
316 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
317 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
318 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
319
320 return rs_perf.create_perf_counters();
321}